From 762d683399a1d45896100413a1c93ab4b55e3da1 Mon Sep 17 00:00:00 2001 From: "jacobtm@torchbox.com" Date: Wed, 11 Sep 2019 08:26:17 +0000 Subject: [PATCH 1/4] Add import_redirects command to allow csv redirects import Adapted from Ambition with additions: - links directly to URL if page object cannot be found - wrapped in transaction.atomic() --- .../management/commands/import_redirects.py | 118 ++++++++++++++++++ 1 file changed, 118 insertions(+) create mode 100644 django-verdant/rca/management/commands/import_redirects.py diff --git a/django-verdant/rca/management/commands/import_redirects.py b/django-verdant/rca/management/commands/import_redirects.py new file mode 100644 index 000000000..e209e3f05 --- /dev/null +++ b/django-verdant/rca/management/commands/import_redirects.py @@ -0,0 +1,118 @@ +from csv import DictReader + +from django.core.management.base import BaseCommand, CommandError +from django.db import transaction +from django.utils.six.moves.urllib.parse import urlparse + +from wagtail.wagtailcore.models import Site, Page +from wagtail.wagtailredirects.models import Redirect + + +def get_page_from_path(path): + """ Takes a full url. Roughly reproduces wagtail.wagtailcore.views.serve. + """ + parsed_path = urlparse(path) + try: + site = Site.objects.get(hostname=parsed_path.netloc) + except Site.DoesNotExist: + import pdb; pdb.set_trace() + + path_components = [component for component in parsed_path.path.split('/') + if component] + page = site.root_page + while path_components: + child_slug = path_components[0] + path_components = path_components[1:] + page = page.get_children().get(slug=child_slug) + return page + + +class Command(BaseCommand): + help = "Creates Wagtail redirects from a csv with a 'from' and 'to' " + "column, where entries are URLs with domains." 
+ + def add_arguments(self, parser): + parser.add_argument('file_path', help="Path to a csv file") + parser.add_argument('--dry-run', action='store_true') + parser.add_argument('--from-header', default='from', + help="Header for old-path column if not 'from'") + parser.add_argument('--to-header', default='to', + help="Header for new-path column if not 'to'") + + def handle(self, *args, **options): + file_path = options['file_path'] + dry_run = options['dry_run'] + from_header = options['from_header'] + to_header = options['to_header'] + + updated_count = 0 + created_count = 0 + error_count = 0 + local_netloc = True + + with open(file_path, 'r') as f: + reader = DictReader(f) + + for row in reader: + old_path = row[from_header] + new_path = row[to_header] + + if old_path and new_path: + + # urlparse requires at least a '//' to avoid identifying the + # domain as a path component + if '//' not in old_path: + old_path = '//' + old_path + + netloc = urlparse(old_path).netloc + if not netloc: + print("Line {} - No domain provided: {}".format(reader.line_num, old_path)) + continue + + try: + old_site = Site.objects.get(hostname=netloc) + except Site.DoesNotExist: + print("Line {} - Site does not exist: {}".format(reader.line_num, netloc)) + error_count += 1 + continue + + normalised_path = Redirect.normalise_path(old_path) + + if len(normalised_path) > 255: + print( + "Line {} - 'From' path is too long ({} characters, maximum is 255)".format( + reader.line_num, len(normalised_path)) + ) + error_count += 1 + continue + + # We don't use .get_or_create because we want to support the + # --dry-run flag + with transaction.atomic(): + try: + redirect = Redirect.objects.get(site=old_site, + old_path=normalised_path) + updated_count += 1 + except Redirect.DoesNotExist: + redirect = Redirect(site=old_site, + old_path=normalised_path) + created_count += 1 + + try: + target_page = get_page_from_path(new_path) #optimally, get Page for redirect + if not dry_run: + 
redirect.redirect_page = target_page + redirect.save() + except Page.DoesNotExist: + print("Line {} - Page does not exist: {}. Linking to URL.".format(reader.line_num, new_path)) + target_url = new_path #else link to URL directly + if not dry_run: + redirect.redirect_link = target_url + redirect.save() + continue + + print("\n") + print("Created: {}".format(created_count)) + print("Updated: {}".format(updated_count)) + print("Errored (so no action taken): {}".format(error_count)) + print("\nDone!") From 1d607de7c762b7e0828d4bc6402bbbad75a6a9fe Mon Sep 17 00:00:00 2001 From: "jacobtm@torchbox.com" Date: Wed, 11 Sep 2019 14:54:32 +0000 Subject: [PATCH 2/4] Removed unused variable local_netloc --- django-verdant/rca/management/commands/import_redirects.py | 1 - 1 file changed, 1 deletion(-) diff --git a/django-verdant/rca/management/commands/import_redirects.py b/django-verdant/rca/management/commands/import_redirects.py index e209e3f05..4d6d8e4db 100644 --- a/django-verdant/rca/management/commands/import_redirects.py +++ b/django-verdant/rca/management/commands/import_redirects.py @@ -48,7 +48,6 @@ def handle(self, *args, **options): updated_count = 0 created_count = 0 error_count = 0 - local_netloc = True with open(file_path, 'r') as f: reader = DictReader(f) From 111d0da38bbb78a9ca4ef01ff288181a647245e5 Mon Sep 17 00:00:00 2001 From: "jacobtm@torchbox.com" Date: Thu, 12 Sep 2019 15:53:30 +0000 Subject: [PATCH 3/4] Remove unnecessary try/catch blocks, debugger reference, transaction.atomic() --- .../management/commands/import_redirects.py | 113 +++++++++--------- 1 file changed, 57 insertions(+), 56 deletions(-) diff --git a/django-verdant/rca/management/commands/import_redirects.py b/django-verdant/rca/management/commands/import_redirects.py index 4d6d8e4db..8359d2530 100644 --- a/django-verdant/rca/management/commands/import_redirects.py +++ b/django-verdant/rca/management/commands/import_redirects.py @@ -12,10 +12,7 @@ def get_page_from_path(path): """ Takes a 
full url. Roughly reproduces wagtail.wagtailcore.views.serve. """ parsed_path = urlparse(path) - try: - site = Site.objects.get(hostname=parsed_path.netloc) - except Site.DoesNotExist: - import pdb; pdb.set_trace() + site = Site.objects.get(hostname=parsed_path.netloc) path_components = [component for component in parsed_path.path.split('/') if component] @@ -53,62 +50,66 @@ def handle(self, *args, **options): reader = DictReader(f) for row in reader: - old_path = row[from_header] - new_path = row[to_header] - - if old_path and new_path: - - # urlparse requires at least a '//' to avoid identifying the - # domain as a path component - if '//' not in old_path: - old_path = '//' + old_path - - netloc = urlparse(old_path).netloc - if not netloc: - print("Line {} - No domain provided: {}".format(reader.line_num, old_path)) - continue - - try: - old_site = Site.objects.get(hostname=netloc) - except Site.DoesNotExist: - print("Line {} - Site does not exist: {}".format(reader.line_num, netloc)) - error_count += 1 - continue - - normalised_path = Redirect.normalise_path(old_path) - - if len(normalised_path) > 255: - print( - "Line {} - 'From' path is too long ({} characters, maximum is 255)".format( - reader.line_num, len(normalised_path)) + old_path = row[from_header].strip() + new_path = row[to_header].strip() + + if not old_path: + continue + + if not new_path: + continue + + # urlparse requires at least a '//' to avoid identifying the + # domain as a path component + if '//' not in old_path: + old_path = '//' + old_path + + netloc = urlparse(old_path).netloc + if not netloc: + print("Line {} - No domain provided: {}".format(reader.line_num, old_path)) + continue + + try: + old_site = Site.objects.get(hostname=netloc) + except Site.DoesNotExist: + print("Line {} - Site does not exist: {}".format(reader.line_num, netloc)) + error_count += 1 + continue + + normalised_path = Redirect.normalise_path(old_path) + + if len(normalised_path) > 255: + print( + "Line {} - 'From' 
path is too long ({} characters, maximum is 255)".format( + reader.line_num, len(normalised_path)) ) - error_count += 1 - continue + error_count += 1 + continue # We don't use .get_or_create because we want to support the # --dry-run flag - with transaction.atomic(): - try: - redirect = Redirect.objects.get(site=old_site, - old_path=normalised_path) - updated_count += 1 - except Redirect.DoesNotExist: - redirect = Redirect(site=old_site, - old_path=normalised_path) - created_count += 1 - - try: - target_page = get_page_from_path(new_path) #optimally, get Page for redirect - if not dry_run: - redirect.redirect_page = target_page - redirect.save() - except Page.DoesNotExist: - print("Line {} - Page does not exist: {}. Linking to URL.".format(reader.line_num, new_path)) - target_url = new_path #else link to URL directly - if not dry_run: - redirect.redirect_link = target_url - redirect.save() - continue + + try: + redirect = Redirect.objects.get(site=old_site, + old_path=normalised_path) + updated_count += 1 + except Redirect.DoesNotExist: + redirect = Redirect(site=old_site, + old_path=normalised_path) + created_count += 1 + + try: + target_page = get_page_from_path(new_path) #optimally, get Page for redirect + if not dry_run: + redirect.redirect_page = target_page + redirect.save() + except Page.DoesNotExist: + print("Line {} - Page does not exist: {}. 
Linking to URL.".format(reader.line_num, new_path)) + target_url = new_path #else link to URL directly + if not dry_run: + redirect.redirect_link = target_url + redirect.save() + continue print("\n") print("Created: {}".format(created_count)) From 2aef907aded003f4ecc326e84528ae50c8edf5a4 Mon Sep 17 00:00:00 2001 From: "jacobtm@torchbox.com" Date: Thu, 12 Sep 2019 16:06:20 +0000 Subject: [PATCH 4/4] Remove unnecessary nesting --- .../rca/management/commands/import_redirects.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/django-verdant/rca/management/commands/import_redirects.py b/django-verdant/rca/management/commands/import_redirects.py index 8359d2530..b713352ad 100644 --- a/django-verdant/rca/management/commands/import_redirects.py +++ b/django-verdant/rca/management/commands/import_redirects.py @@ -90,26 +90,21 @@ def handle(self, *args, **options): # --dry-run flag try: - redirect = Redirect.objects.get(site=old_site, - old_path=normalised_path) + redirect = Redirect.objects.get(site=old_site, old_path=normalised_path) updated_count += 1 except Redirect.DoesNotExist: - redirect = Redirect(site=old_site, - old_path=normalised_path) + redirect = Redirect(site=old_site, old_path=normalised_path) created_count += 1 try: target_page = get_page_from_path(new_path) #optimally, get Page for redirect - if not dry_run: - redirect.redirect_page = target_page - redirect.save() + redirect.redirect_page = target_page except Page.DoesNotExist: print("Line {} - Page does not exist: {}. Linking to URL.".format(reader.line_num, new_path)) target_url = new_path #else link to URL directly - if not dry_run: - redirect.redirect_link = target_url - redirect.save() - continue + redirect.redirect_link = target_url + if not dry_run: + redirect.save() print("\n") print("Created: {}".format(created_count))