Skip to content

Commit

Permalink
Merge pull request #902 from kobotoolbox/improve-soft-delete-orphan-m…
Browse files Browse the repository at this point in the history
…ngnt-command

Improve `soft_delete_orphan_attachments` management command speed
  • Loading branch information
noliveleger committed Oct 26, 2023
2 parents 1c41cea + 8842358 commit 0c40234
Show file tree
Hide file tree
Showing 4 changed files with 95 additions and 15 deletions.
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
from __future__ import annotations

from datetime import timedelta

from django.core.management.base import BaseCommand
from django.db.models import F

from onadata.apps.logger.models import (
Attachment,
Expand All @@ -19,35 +22,107 @@ def add_arguments(self, parser):
'--chunks',
type=int,
default=2000,
help="Number of records to process per query"
help='Number of records to process per query'
)

parser.add_argument(
'--start-id',
type=int,
default=0,
help='Instance ID to start from'
)

parser.add_argument(
'--start-date',
type=str,
default=None,
help='Starting date to start from. Format: yyyy-mm-aa.'
)

parser.add_argument(
'--not-edited-offset',
type=int,
default=10,
help=(
'Offset in seconds between creation date and modification date '
'to consider submissions as not edited'
)
)

def handle(self, *args, **kwargs):
chunks = kwargs['chunks']
verbosity = kwargs['verbosity']
start_id = kwargs['start_id']
start_date = kwargs['start_date']
offset = kwargs['not_edited_offset']

self.stdout.write(
'⚠ Warning! This management can take a while (i.e. several days) '
'to run on big databases'
)

instance_ids = Attachment.objects.values_list(
'instance_id', flat=True
).distinct()

queryset = Attachment.objects.values_list('instance_id', flat=True).distinct()
if start_id:
instance_ids = instance_ids.filter(instance_id__gte=start_id)

queryset = (
Instance.objects.only('xml', 'xform')
.filter(pk__in=instance_ids)
.exclude(
date_modified__lt=F('date_created')
+ timedelta(seconds=offset),
)
)

if start_id:
queryset = queryset.filter(pk__gte=start_id)

if start_date:
queryset = queryset.filter(date_created__date__gte=start_date)

if verbosity > 1:
self.stdout.write(
f'Calculating number of instance with attachments…'
f'Calculating number of Instance objects with attachments…'
)
instances_count = queryset.count()

cpt = 1
queryset = queryset.order_by('pk')

if verbosity > 1:
self.stdout.write(
f'Retrieving Instance objects…'
)

for instance_id in queryset.iterator(chunk_size=chunks):
instance = Instance.objects.get(pk=instance_id)
for instance in queryset.iterator(chunk_size=chunks):
if verbosity > 0:
message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}'
self.stdout.write(
f'Processing Instance object #{instance_id}{message}…'
f'Processing Instance object #{instance.pk}{message}…'
)
soft_deleted_attachments = get_soft_deleted_attachments(instance)

try:
soft_deleted_attachments = get_soft_deleted_attachments(instance)
except Exception as e:
cpt += 1
if verbosity > 0:
self.stderr.write(f'\tError: {str(e)}')
continue

for soft_deleted_attachment in soft_deleted_attachments:
# Avoid fetching Instance object once again
soft_deleted_attachment.instance = instance
pre_delete_attachment(
soft_deleted_attachment, only_update_counters=True
)

if verbosity > 1:
message = '' if verbosity <= 1 else f' - {cpt}/{instances_count}'
self.stdout.write(
f'\tInstance object #{instance.pk}{message} updated!'
)
cpt += 1

self.stdout.write('Done!')
2 changes: 1 addition & 1 deletion onadata/apps/logger/models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -252,7 +252,7 @@ class Instance(models.Model):
default='submitted_via_web')
uuid = models.CharField(max_length=249, default='', db_index=True)

# store an geographic objects associated with this instance
# store a geographic objects associated with this instance
geom = models.GeometryCollectionField(null=True)

tags = TaggableManager()
Expand Down
14 changes: 10 additions & 4 deletions onadata/apps/logger/models/xform.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import json
import os
import re
from copy import deepcopy
from io import BytesIO
from xml.sax.saxutils import escape as xml_escape

Expand Down Expand Up @@ -129,10 +130,15 @@ def url(self):
}
)

def data_dictionary(self):
from onadata.apps.viewer.models.data_dictionary import\
DataDictionary
return DataDictionary.all_objects.get(pk=self.pk)
def data_dictionary(self, use_cache: bool = False):
from onadata.apps.viewer.models.data_dictionary import DataDictionary

if not use_cache:
return DataDictionary.all_objects.get(pk=self.pk)

xform_dict = deepcopy(self.__dict__)
xform_dict.pop('_state', None)
return DataDictionary(**xform_dict)

@property
def has_instances_with_geopoints(self):
Expand Down
3 changes: 1 addition & 2 deletions onadata/apps/logger/xform_instance_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -366,10 +366,9 @@ def get_xform_media_question_xpaths(
xform: 'onadata.apps.logger.models.XForm',
) -> list:
logger = logging.getLogger('console_logger')
parser = XFormInstanceParser(xform.xml, xform.data_dictionary())
parser = XFormInstanceParser(xform.xml, xform.data_dictionary(use_cache=True))
all_attributes = _get_all_attributes(parser.get_root_node())
media_field_xpaths = []

# This code expects that the attributes from Enketo Express are **always**
# sent in the same order.
# For example:
Expand Down

0 comments on commit 0c40234

Please sign in to comment.