diff --git a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py index 586bacfe0..dfd5a73c0 100644 --- a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py +++ b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py @@ -1,6 +1,9 @@ from __future__ import annotations +from datetime import timedelta + from django.core.management.base import BaseCommand +from django.db.models import F from onadata.apps.logger.models import ( Attachment, @@ -19,35 +22,107 @@ def add_arguments(self, parser): '--chunks', type=int, default=2000, - help="Number of records to process per query" + help='Number of records to process per query' ) + + parser.add_argument( + '--start-id', + type=int, + default=0, + help='Instance ID to start from' + ) + + parser.add_argument( + '--start-date', + type=str, + default=None, + help='Date to start from. Format: yyyy-mm-dd.' + ) + + parser.add_argument( + '--not-edited-offset', + type=int, + default=10, + help=( + 'Offset in seconds between creation date and modification date ' + 'to consider submissions as not edited' + ) ) def handle(self, *args, **kwargs): chunks = kwargs['chunks'] verbosity = kwargs['verbosity'] + start_id = kwargs['start_id'] + start_date = kwargs['start_date'] + offset = kwargs['not_edited_offset'] + + self.stdout.write( + '⚠ Warning! This management command can take a while (i.e. 
several days) ' + 'to run on big databases' + ) + + instance_ids = Attachment.objects.values_list( + 'instance_id', flat=True + ).distinct() - queryset = Attachment.objects.values_list('instance_id', flat=True).distinct() + if start_id: + instance_ids = instance_ids.filter(instance_id__gte=start_id) + + queryset = ( + Instance.objects.only('xml', 'xform') + .filter(pk__in=instance_ids) + .exclude( + date_modified__lt=F('date_created') + + timedelta(seconds=offset), + ) + ) + + if start_id: + queryset = queryset.filter(pk__gte=start_id) + + if start_date: + queryset = queryset.filter(date_created__date__gte=start_date) if verbosity > 1: self.stdout.write( - f'Calculating number of instance with attachments…' + f'Calculating number of Instance objects with attachments…' ) instances_count = queryset.count() cpt = 1 + queryset = queryset.order_by('pk') + + if verbosity > 1: + self.stdout.write( + f'Retrieving Instance objects…' + ) - for instance_id in queryset.iterator(chunk_size=chunks): - instance = Instance.objects.get(pk=instance_id) + for instance in queryset.iterator(chunk_size=chunks): if verbosity > 0: message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' self.stdout.write( - f'Processing Instance object #{instance_id}{message}…' + f'Processing Instance object #{instance.pk}{message}…' ) - soft_deleted_attachments = get_soft_deleted_attachments(instance) + + try: + soft_deleted_attachments = get_soft_deleted_attachments(instance) + except Exception as e: + cpt += 1 + if verbosity > 0: + self.stderr.write(f'\tError: {str(e)}') + continue + for soft_deleted_attachment in soft_deleted_attachments: + # Avoid fetching Instance object once again + soft_deleted_attachment.instance = instance pre_delete_attachment( soft_deleted_attachment, only_update_counters=True ) - + if verbosity > 1: + message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' + self.stdout.write( + f'\tInstance object #{instance.pk}{message} updated!' 
+ ) cpt += 1 + self.stdout.write('Done!') diff --git a/onadata/apps/logger/models/instance.py b/onadata/apps/logger/models/instance.py index 1ac617449..ee6ef5ad6 100644 --- a/onadata/apps/logger/models/instance.py +++ b/onadata/apps/logger/models/instance.py @@ -252,7 +252,7 @@ class Instance(models.Model): default='submitted_via_web') uuid = models.CharField(max_length=249, default='', db_index=True) - # store an geographic objects associated with this instance + # store geographic objects associated with this instance geom = models.GeometryCollectionField(null=True) tags = TaggableManager() diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index 146a31e9a..84995d3ec 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -2,6 +2,7 @@ import json import os import re +from copy import deepcopy from io import BytesIO from xml.sax.saxutils import escape as xml_escape @@ -129,10 +130,15 @@ def url(self): } ) - def data_dictionary(self): - from onadata.apps.viewer.models.data_dictionary import\ - DataDictionary - return DataDictionary.all_objects.get(pk=self.pk) + def data_dictionary(self, use_cache: bool = False): + from onadata.apps.viewer.models.data_dictionary import DataDictionary + + if not use_cache: + return DataDictionary.all_objects.get(pk=self.pk) + + xform_dict = deepcopy(self.__dict__) + xform_dict.pop('_state', None) + return DataDictionary(**xform_dict) @property def has_instances_with_geopoints(self): diff --git a/onadata/apps/logger/xform_instance_parser.py b/onadata/apps/logger/xform_instance_parser.py index cea69d0ba..a6c21c8de 100644 --- a/onadata/apps/logger/xform_instance_parser.py +++ b/onadata/apps/logger/xform_instance_parser.py @@ -366,10 +366,9 @@ def get_xform_media_question_xpaths( xform: 'onadata.apps.logger.models.XForm', ) -> list: logger = logging.getLogger('console_logger') - parser = 
XFormInstanceParser(xform.xml, xform.data_dictionary(use_cache=True)) all_attributes = _get_all_attributes(parser.get_root_node()) media_field_xpaths = [] - # This code expects that the attributes from Enketo Express are **always** # sent in the same order. # For example: