# django-verdant/rca/management/commands/import_redirects.py
from csv import DictReader

from django.core.management.base import BaseCommand, CommandError
from django.db import transaction
from django.utils.six.moves.urllib.parse import urlparse

from wagtail.wagtailcore.models import Site, Page
from wagtail.wagtailredirects.models import Redirect


def get_page_from_path(path):
    """Return the page that a full URL points at.

    Roughly reproduces wagtail.wagtailcore.views.serve: the URL's domain
    selects the Site, then every non-empty path component is matched against
    the slug of a child page, walking down from the site's root page.

    Raises Site.DoesNotExist when no site has the URL's domain as hostname,
    and Page.DoesNotExist when a path component matches no child page.
    """
    parsed_path = urlparse(path)
    site = Site.objects.get(hostname=parsed_path.netloc)

    page = site.root_page
    for child_slug in parsed_path.path.split('/'):
        if not child_slug:
            # Leading, trailing and doubled slashes yield empty components.
            continue
        page = page.get_children().get(slug=child_slug)
    return page


class Command(BaseCommand):
    """Management command that imports redirects from a csv file."""

    # Parenthesised so that both literals are concatenated into one string;
    # without the parentheses the second literal is a separate no-op
    # statement and the help text is cut off mid-sentence.
    help = (
        "Creates Wagtail redirects from a csv with a 'from' and 'to' "
        "column, where entries are URLs with domains."
    )

    def add_arguments(self, parser):
        """Register the csv path, the dry-run flag and the header names."""
        parser.add_argument('file_path', help="Path to a csv file")
        parser.add_argument('--dry-run', action='store_true')
        parser.add_argument('--from-header', default='from',
                            help="Header for old-path column if not 'from'")
        parser.add_argument('--to-header', default='to',
                            help="Header for new-path column if not 'to'")

    def handle(self, *args, **options):
        """Create or update one Redirect per usable csv row.

        Rows with a blank 'from' or 'to' cell are skipped silently. Rows
        whose 'from' URL has no domain, names an unknown site, or is longer
        than 255 characters once normalised are reported and counted as
        errors. The target is stored as a page when the 'to' URL resolves to
        one, otherwise as a plain link. Nothing is saved when --dry-run is
        given, but the counts are still reported.
        """
        file_path = options['file_path']
        dry_run = options['dry_run']
        from_header = options['from_header']
        to_header = options['to_header']

        updated_count = 0
        created_count = 0
        error_count = 0

        with open(file_path, 'r') as f:
            reader = DictReader(f)

            for row in reader:
                # DictReader fills the cells of a short row with None; treat
                # those as blank instead of failing on None.strip().
                old_path = (row[from_header] or '').strip()
                new_path = (row[to_header] or '').strip()

                if not old_path or not new_path:
                    continue

                # urlparse requires at least a '//' to avoid identifying the
                # domain as a path component
                if '//' not in old_path:
                    old_path = '//' + old_path

                netloc = urlparse(old_path).netloc
                if not netloc:
                    print("Line {} - No domain provided: {}".format(reader.line_num, old_path))
                    # Counted like every other reported-and-skipped row.
                    error_count += 1
                    continue

                try:
                    old_site = Site.objects.get(hostname=netloc)
                except Site.DoesNotExist:
                    print("Line {} - Site does not exist: {}".format(reader.line_num, netloc))
                    error_count += 1
                    continue

                normalised_path = Redirect.normalise_path(old_path)

                if len(normalised_path) > 255:
                    print(
                        "Line {} - 'From' path is too long ({} characters, maximum is 255)".format(
                            reader.line_num, len(normalised_path))
                    )
                    error_count += 1
                    continue

                # We don't use .get_or_create because we want to support the
                # --dry-run flag
                try:
                    redirect = Redirect.objects.get(site=old_site, old_path=normalised_path)
                    updated_count += 1
                except Redirect.DoesNotExist:
                    redirect = Redirect(site=old_site, old_path=normalised_path)
                    created_count += 1

                try:
                    # Optimally, point the redirect at a Page.
                    target_page = get_page_from_path(new_path)
                except (Page.DoesNotExist, Site.DoesNotExist):
                    # Site.DoesNotExist covers targets on a domain that is
                    # not one of our sites (e.g. an external URL); those must
                    # fall back to a plain link rather than abort the import.
                    print("Line {} - Page does not exist: {}. Linking to URL.".format(reader.line_num, new_path))
                    # Drop any page left over from an earlier import so the
                    # redirect has exactly one target.
                    redirect.redirect_page = None
                    redirect.redirect_link = new_path
                else:
                    redirect.redirect_page = target_page
                    # Likewise drop any link left over from an earlier import.
                    redirect.redirect_link = ''

                if not dry_run:
                    redirect.save()

        print("\n")
        print("Created: {}".format(created_count))
        print("Updated: {}".format(updated_count))
        print("Errored (so no action taken): {}".format(error_count))
        print("\nDone!")