From 57040f15516176a371a2228c89540f05ff675322 Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Thu, 19 Oct 2023 14:56:07 -0400 Subject: [PATCH 1/3] Avoid not edited instance --- .../soft_delete_orphan_attachments.py | 71 ++++++++++++++++++- 1 file changed, 69 insertions(+), 2 deletions(-) diff --git a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py index 586bacfe0..c414010a8 100644 --- a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py +++ b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py @@ -8,6 +8,7 @@ ) from onadata.apps.logger.signals import pre_delete_attachment from onadata.libs.utils.logger_tools import get_soft_deleted_attachments +from onadata.apps.viewer.models.parsed_instance import datetime_from_str class Command(BaseCommand): @@ -19,15 +20,34 @@ def add_arguments(self, parser): '--chunks', type=int, default=2000, - help="Number of records to process per query" + help='Number of records to process per query' + ) + + parser.add_argument( + '--start-id', + type=int, + default=0, + help='Instance ID to start from' + ) + + parser.add_argument( + '--start-date', + type=str, + default=None, + help='Starting date to start from. Format: yyyy-mm-aa.' ) def handle(self, *args, **kwargs): chunks = kwargs['chunks'] verbosity = kwargs['verbosity'] + start_id = kwargs['start_id'] + start_date = kwargs['start_date'] queryset = Attachment.objects.values_list('instance_id', flat=True).distinct() + if start_id: + queryset = queryset.filter(instance_id__gte=start_id) + if verbosity > 1: self.stdout.write( f'Calculating number of instance with attachments…' @@ -43,11 +63,58 @@ def handle(self, *args, **kwargs): self.stdout.write( f'Processing Instance object #{instance_id}{message}…' ) - soft_deleted_attachments = get_soft_deleted_attachments(instance) + + if start_date and instance.date_created < datetime_from_str( + f'{start_date}T00:00:00 +0000' + ): + if verbosity > 1: + message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' + self.stdout.write( + f'\tSkip Instance object #{instance_id}{message}. Too old' + ) + cpt += 1 + continue + + if not self._has_instance_been_edited(instance): + if verbosity > 1: + message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' + self.stdout.write( + f'\tSkip Instance object #{instance_id}{message}. Not edited' + ) + cpt += 1 + continue + + try: + soft_deleted_attachments = get_soft_deleted_attachments(instance) + except Exception as e: + cpt += 1 + if verbosity > 0: + self.stderr.write( + f'\tError for Instance object #{instance_id}: {str(e)}' + ) + continue + for soft_deleted_attachment in soft_deleted_attachments: pre_delete_attachment( soft_deleted_attachment, only_update_counters=True ) + if verbosity > 1: + message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' + self.stdout.write( + f'\tInstance object #{instance_id}{message} updated!' + ) + cpt += 1 cpt += 1 self.stdout.write('Done!') + + def _has_instance_been_edited(self, instance): + """ + Consider instance as edited if it modification date is more or less 10 + seconds apart + """ + date_created_ts = instance.date_created.timestamp() + date_modified_ts = instance.date_modified.timestamp() + return not ( + date_created_ts <= date_modified_ts <= date_created_ts + 10 + ) From f2328a51a7ebc8da2ab68c4aef5ef52183868c6a Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Mon, 23 Oct 2023 10:28:33 -0400 Subject: [PATCH 2/3] Use ORM to filter Instance objects --- .../soft_delete_orphan_attachments.py | 89 ++++++++++--------- onadata/apps/logger/models/instance.py | 2 +- 2 files changed, 48 insertions(+), 43 deletions(-) diff --git a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py index c414010a8..c2520e396 100644 --- a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py +++ b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py @@ -1,6 +1,9 @@ from __future__ import annotations +from datetime import timedelta + from django.core.management.base import BaseCommand +from django.db.models import F from onadata.apps.logger.models import ( Attachment, @@ -37,61 +40,75 @@ def add_arguments(self, parser): help='Starting date to start from. Format: yyyy-mm-aa.' ) + parser.add_argument( + '--not-edited-offset', + type=int, + default=10, + help=( + 'Offset in seconds between creation date and modification date ' + 'to consider submissions as not edited' + ) + ) + def handle(self, *args, **kwargs): chunks = kwargs['chunks'] verbosity = kwargs['verbosity'] start_id = kwargs['start_id'] start_date = kwargs['start_date'] + offset = kwargs['not_edited_offset'] - queryset = Attachment.objects.values_list('instance_id', flat=True).distinct() + self.stdout.write( + '⚠ Warning! This management can take a while (i.e. several days) ' + 'to run on big databases' + ) + + instance_ids = Attachment.objects.values_list('instance_id', flat=True).distinct() + + if start_id: + instance_ids = instance_ids.filter(instance_id__gte=start_id) + + queryset = ( + Instance.objects.only('xml', 'xform') + .filter(pk__in=instance_ids) + .exclude( + date_modified__lt=F('date_created') + + timedelta(seconds=offset), + ) + ) if start_id: - queryset = queryset.filter(instance_id__gte=start_id) + queryset = queryset.filter(pk__gte=start_id) + + if start_date: + queryset = queryset.filter(date_created__date__gte=start_date) if verbosity > 1: self.stdout.write( - f'Calculating number of instance with attachments…' + f'Calculating number of Instance objects with attachments…' ) instances_count = queryset.count() cpt = 1 + queryset = queryset.order_by('pk') - for instance_id in queryset.iterator(chunk_size=chunks): - instance = Instance.objects.get(pk=instance_id) + if verbosity > 1: + self.stdout.write( + f'Retrieving Instance objects…' + ) + + for instance in queryset.iterator(chunk_size=chunks): if verbosity > 0: message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' self.stdout.write( - f'Processing Instance object #{instance_id}{message}…' + f'Processing Instance object #{instance.pk}{message}…' ) - if start_date and instance.date_created < datetime_from_str( - f'{start_date}T00:00:00 +0000' - ): - if verbosity > 1: - message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' - self.stdout.write( - f'\tSkip Instance object #{instance_id}{message}. Too old' - ) - cpt += 1 - continue - - if not self._has_instance_been_edited(instance): - if verbosity > 1: - message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' - self.stdout.write( - f'\tSkip Instance object #{instance_id}{message}. Not edited' - ) - cpt += 1 - continue - try: soft_deleted_attachments = get_soft_deleted_attachments(instance) except Exception as e: cpt += 1 if verbosity > 0: - self.stderr.write( - f'\tError for Instance object #{instance_id}: {str(e)}' - ) + self.stderr.write(f'\tError: {str(e)}') continue for soft_deleted_attachment in soft_deleted_attachments: @@ -101,20 +118,8 @@ def handle(self, *args, **kwargs): if verbosity > 1: message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}' self.stdout.write( - f'\tInstance object #{instance_id}{message} updated!' + f'\tInstance object #{instance.pk}{message} updated!' ) cpt += 1 - cpt += 1 self.stdout.write('Done!') - - def _has_instance_been_edited(self, instance): - """ - Consider instance as edited if it modification date is more or less 10 - seconds apart - """ - date_created_ts = instance.date_created.timestamp() - date_modified_ts = instance.date_modified.timestamp() - return not ( - date_created_ts <= date_modified_ts <= date_created_ts + 10 - ) diff --git a/onadata/apps/logger/models/instance.py b/onadata/apps/logger/models/instance.py index 1ac617449..ee6ef5ad6 100644 --- a/onadata/apps/logger/models/instance.py +++ b/onadata/apps/logger/models/instance.py @@ -252,7 +252,7 @@ class Instance(models.Model): default='submitted_via_web') uuid = models.CharField(max_length=249, default='', db_index=True) - # store an geographic objects associated with this instance + # store a geographic objects associated with this instance geom = models.GeometryCollectionField(null=True) tags = TaggableManager() From 8842358d7ff9d6c2de86ca15abb7ea8c68e9aefc Mon Sep 17 00:00:00 2001 From: Olivier Leger Date: Tue, 24 Oct 2023 10:56:48 -0400 Subject: [PATCH 3/3] Skip redundant queries to DB --- .../commands/soft_delete_orphan_attachments.py | 7 +++++-- onadata/apps/logger/models/xform.py | 14 ++++++++++---- onadata/apps/logger/xform_instance_parser.py | 3 +-- 3 files changed, 16 insertions(+), 8 deletions(-) diff --git a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py index c2520e396..dfd5a73c0 100644 --- a/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py +++ b/onadata/apps/logger/management/commands/soft_delete_orphan_attachments.py @@ -11,7 +11,6 @@ ) from onadata.apps.logger.signals import pre_delete_attachment from onadata.libs.utils.logger_tools import get_soft_deleted_attachments -from onadata.apps.viewer.models.parsed_instance import datetime_from_str class Command(BaseCommand): @@ -62,7 +61,9 @@ def handle(self, *args, **kwargs): 'to run on big databases' ) - instance_ids = Attachment.objects.values_list('instance_id', flat=True).distinct() + instance_ids = Attachment.objects.values_list( + 'instance_id', flat=True + ).distinct() if start_id: instance_ids = instance_ids.filter(instance_id__gte=start_id) @@ -112,6 +113,8 @@ def handle(self, *args, **kwargs): continue for soft_deleted_attachment in soft_deleted_attachments: + # Avoid fetching Instance object once again + soft_deleted_attachment.instance = instance pre_delete_attachment( soft_deleted_attachment, only_update_counters=True ) diff --git a/onadata/apps/logger/models/xform.py b/onadata/apps/logger/models/xform.py index 146a31e9a..84995d3ec 100644 --- a/onadata/apps/logger/models/xform.py +++ b/onadata/apps/logger/models/xform.py @@ -2,6 +2,7 @@ import json import os import re +from copy import deepcopy from io import BytesIO from xml.sax.saxutils import escape as xml_escape @@ -129,10 +130,15 @@ def url(self): } ) - def data_dictionary(self): - from onadata.apps.viewer.models.data_dictionary import\ - DataDictionary - return DataDictionary.all_objects.get(pk=self.pk) + def data_dictionary(self, use_cache: bool = False): + from onadata.apps.viewer.models.data_dictionary import DataDictionary + + if not use_cache: + return DataDictionary.all_objects.get(pk=self.pk) + + xform_dict = deepcopy(self.__dict__) + xform_dict.pop('_state', None) + return DataDictionary(**xform_dict) @property def has_instances_with_geopoints(self): diff --git a/onadata/apps/logger/xform_instance_parser.py b/onadata/apps/logger/xform_instance_parser.py index cea69d0ba..a6c21c8de 100644 --- a/onadata/apps/logger/xform_instance_parser.py +++ b/onadata/apps/logger/xform_instance_parser.py @@ -366,10 +366,9 @@ def get_xform_media_question_xpaths( xform: 'onadata.apps.logger.models.XForm', ) -> list: logger = logging.getLogger('console_logger') - parser = XFormInstanceParser(xform.xml, xform.data_dictionary()) + parser = XFormInstanceParser(xform.xml, xform.data_dictionary(use_cache=True)) all_attributes = _get_all_attributes(parser.get_root_node()) media_field_xpaths = [] - # This code expects that the attributes from Enketo Express are **always** # sent in the same order. # For example: