Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handle a question whose type changes but name doesn't. #187

Open
wants to merge 18 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
18 commits
Select commit Hold shift + click to select a range
7fb1ad9
Creates a unique_name when field is added to FormVersion
noliveleger Dec 13, 2018
c15fc18
Merge branch '182-shifted-value-with-deleted-multiple-options' into 1…
noliveleger Dec 13, 2018
5af7218
Exports supports data for same questions with different types
noliveleger Dec 13, 2018
9652455
Changed unittest to match new representation of FormField
noliveleger Dec 14, 2018
5953cde
Fixed stats calculation when multiple versions contains same question…
noliveleger Dec 14, 2018
cc58e06
Fixed typo
noliveleger Dec 14, 2018
f4809a4
Used property 'contextual_name' instead of 'name'
noliveleger Dec 14, 2018
dc69313
Changed unittests for multiple versions with questions with same name…
noliveleger Dec 14, 2018
3985489
Removed useless try/except when trying parsing values of NumField
noliveleger Dec 14, 2018
a042146
Merge branch 'master' into 151-question-with-different-types
noliveleger Dec 17, 2018
3b0db4d
Merge branch 'master' into 151-question-with-different-types
noliveleger Dec 18, 2018
dc05c1d
Applied requested changes for PR #187
noliveleger Dec 18, 2018
50b5a2e
Changed FormDataDef representation string
noliveleger Dec 18, 2018
16a896a
Applied PEP-8 guidance
noliveleger Dec 19, 2018
d1c5429
Removed useless import
noliveleger Dec 19, 2018
f0ddf7d
Applied logic of '_calculate_stats()' to '_disaggregate_stats()' in o…
noliveleger Dec 20, 2018
bd25c51
Merge 'master' branch into 151-question-with-different-types
noliveleger Jul 8, 2019
785ebda
Restored try/except to handle unexpected values as blank when calcula…
noliveleger Jul 8, 2019
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 26 additions & 16 deletions src/formpack/pack.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,6 @@ def version_id_keys(self, _versions=None):
_id_keys.append(_id_key)
return _id_keys


@property
def available_translations(self):
translations = set()
Expand Down Expand Up @@ -191,25 +190,30 @@ def summr(v):
return ''.join(out)

@staticmethod
def _combine_field_choices(old_field, new_field):
def _combine_field_choices(older_version_field, current_field):
"""
Update `new_field.choice` so that it contains everything from
`old_field.choice`. In the event of a conflict, `new_field.choice`
Updates `current_field.choice` so that it contains everything from
`older_version_field.choice`. In the event of a conflict, `current_field.choice`
wins. If either field does not have a `choice` attribute, do
nothing

:param old_field: FormField
:param new_field: FormField
:param older_version_field: FormField
:param current_field: FormField
:return: FormField. Updated new_field
"""

try:
old_choice = old_field.choice
new_choice = new_field.choice
new_field.merge_choice(old_choice)
if hasattr(current_field, "merge_choice"):
jnm marked this conversation as resolved.
Show resolved Hide resolved
older_version_choice = older_version_field.choice
current_field.merge_choice(older_version_choice)
except AttributeError:
pass

return new_field
return current_field

@staticmethod
def _do_fields_match(older_versioned_field, current_field):
return older_versioned_field.signature == current_field.signature

def get_fields_for_versions(self, versions=-1, data_types=None):

Expand All @@ -233,7 +237,6 @@ def get_fields_for_versions(self, versions=-1, data_types=None):
if isinstance(data_types, str_types):
data_types = [data_types]


# tmp2 is a 2 dimensions list of `field`.
# First dimension is the position of fields where they should be in the latest version
# Second dimension is their position in the stack at the same position.
Expand Down Expand Up @@ -272,16 +275,23 @@ def get_fields_for_versions(self, versions=-1, data_types=None):
for section_name, section in version.sections.items():
for field_name, field_object in section.fields.items():
if not isinstance(field_object, CopyField):
add_field = True
if field_name in positions:
position = positions[field_name]
latest_field_object = tmp2d[position[0]][position[1]]
# Because versions_desc are ordered from latest to oldest,
# we use current field object as the old one and the one already
# in position as the latest one.
new_object = self._combine_field_choices(
field_object, latest_field_object)
tmp2d[position[0]][position[1]] = new_object
else:

if self._do_fields_match(field_object, latest_field_object):
new_object = self._combine_field_choices(
field_object, latest_field_object)
tmp2d[position[0]][position[1]] = new_object
add_field = False
else:
field_object.use_unique_name = True

if add_field:
try:
current_index_list = tmp2d[index]
current_index_list.append(field_object)
Expand All @@ -292,7 +302,7 @@ def get_fields_for_versions(self, versions=-1, data_types=None):
# it can happen when current version has more items than newest one.
index = len(tmp2d) - 1

positions[field_name] = (index, len(tmp2d[index]) - 1)
positions[field_object.contextual_name] = (index, len(tmp2d[index]) - 1)

index += 1

Expand Down
53 changes: 43 additions & 10 deletions src/formpack/reporting/autoreport.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,11 +49,21 @@ def _get_version_id_from_submission(self, submission):

def _calculate_stats(self, submissions, fields, versions, lang):

metrics = {field.name: Counter() for field in fields}
metrics = {field.contextual_name: Counter() for field in fields}

submissions_count = 0
submission_counts_by_version = Counter()

# When form contains questions with the same name with different types,
# Older found versions are pushed at the end of the list `fields`
# Because we want to match submission values with fields, we need to try
# to match with older version first.
# For example: Form contains two versions with one question.
# `reversed_fields` look this:
# [<FormField type="text" contextual_name="question_text_v123456">,
# <FormField type="integer" contextual_name="question">]
reversed_fields = list(reversed(fields))

for entry in submissions:

version_id = self._get_version_id_from_submission(entry)
Expand All @@ -62,14 +72,37 @@ def _calculate_stats(self, submissions, fields, versions, lang):

submissions_count += 1
submission_counts_by_version[version_id] += 1
fields_to_skip = []

# TODO: do we really need FormSubmission ?
entry = FormSubmission(entry).data
for field in fields:
if field.has_stats:
counter = metrics[field.name]

for field in reversed_fields:
if field.has_stats and field.name not in fields_to_skip:
counter = metrics[field.contextual_name]
raw_value = entry.get(field.path)
field_contextual_name = "{}_{}_{}".format(
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(self: revise this) i guess this is the expected contextual name, used for comparison not assignment?
possible to use the create_unique_name() method to avoid hard-coding this here?
https://github.com/kobotoolbox/formpack/pull/187/files#diff-7845025cfa4cd9a06d7a4399e1ba9c0bR58

or, maybe if field.contextual_name.startswith("{}_{}_v".format( below could be changed to something like if field.use_unique_name?

Copy link
Contributor Author

@noliveleger noliveleger Dec 18, 2018

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@jnm: I now remember why I did this comparison this way.

We have

                        if field.contextual_name.startswith("{}_{}_v".format(
                            field.name,
                            field.data_type
                        )):
                            # We have a match, we won't evaluate other fields with same name
                            # for this submission.
                            if field.contextual_name == field_contextual_name:
                                fields_to_skip.append(field.name)
                            else:
                                continue

Let's explain:

All fields that are added by get_fields_for_versions (because another type has been detected in the latest version) match the pattern name_data_type_version_id.

  • Trivial case: Pattern doesn't match. Data belongs to field. Skip first if statement.
  • Other case: We have a match with this pattern
    1. version_id is the same. Data belongs to this field. All other fields with same name will be skipped for this submission.
    2. version_id is not the same, we want to iterate over loop immediately because the data
      should be appended to the corresponding field (or column in the export) only - which is a further occurrence of the loop.

The {}_{}_v pattern is used to avoid matching another field name.
For example:
- Question 1: restaurant
- Question 2: restaurant_text

Adding the "_v" in the pattern decreases the risk of matching the wrong field.
Obviously if all these rules are true:

  • Form has several versions
  • Form has a question named restaurant has multiple types among versions
  • One of its types is text
  • Form has a question name restaurant_text_v

Export will have data shifted again. The comparison would be a regex to decrease the risk a little bit more.
Something like _v[\d\w]{21,}. What do you think?
Having this said: Your suggestion to use create_unique_name is really good idea.

About the idea of maybe if field.contextual_name.startswith("{}_{}_v".format( below could be changed to something like if field.use_unique_name?
Actually it can't.

In case the form has more than 3 versions and each version contains a question with the same name but 3 different types.
In that case fields should be:

[<FormField type="text" contextual_name="question_text_v123456">,
  <FormField type="integer" contextual_name="question_integer_v234567">,
  <FormField type="date_time" contextual_name="question">]

Let's say, we have 1 submission for each version where value of the question question would be

  • Submission 1: answer
  • Submission 2: 1
  • Submission 3: 2018-12-18 00:00:00

Using field.use_unique_name, submission 1 and 2 would be matched with field question_text_v123456 because both fields question_text_v123456 and question_integer_v234567 have use_unique_name property equals True. Using the string comparison submission 2 can't be matched with question_text_v123456 (because its version_id is not the same)

field.name,
field.data_type,
version_id
)

if raw_value is not None:
# Reminder `field.contextual_name` equals `<name>_<type>_<version_uid>`,
# If `field.contextual_name` matches this pattern, it's only valid for submissions of
# the same `<version_uid>`.
# Thanks to `reversed_fields`, we compared those fields before the ones of the latest version
if field.contextual_name.startswith("{}_{}_v".format(
field.name,
field.data_type
)):
# We have a match, we won't evaluate other fields with same name
# for this submission.
if field.contextual_name == field_contextual_name:
fields_to_skip.append(field.name)
else:
continue

values = list(field.parse_values(raw_value))
counter.update(values)
counter['__submissions__'] += 1
Expand All @@ -80,7 +113,7 @@ def stats_generator():
for field in fields:
yield (field,
field.get_labels(lang)[0],
field.get_stats(metrics[field.name], lang=lang))
field.get_stats(metrics[field.contextual_name], lang=lang))

return AutoReportStats(self, stats_generator(), submissions_count,
submission_counts_by_version)
Expand Down Expand Up @@ -110,7 +143,7 @@ def _disaggregate_stats(self, submissions, fields, versions, lang, split_by_fiel
# field_name2...},
# ...}
#
metrics = {f.name: defaultdict(Counter) for f in fields}
metrics = {f.contextual_name: defaultdict(Counter) for f in fields}

for sbmssn in submissions:

Expand Down Expand Up @@ -142,7 +175,7 @@ def _disaggregate_stats(self, submissions, fields, versions, lang, split_by_fiel
else:
values = (None,)

value_metrics = metrics[field.name]
value_metrics = metrics[field.contextual_name]

for value in values:
counters = value_metrics[value]
Expand Down Expand Up @@ -177,7 +210,7 @@ def _disaggregate_stats(self, submissions, fields, versions, lang, split_by_fiel

def stats_generator():
for field in fields:
stats = field.get_disaggregated_stats(metrics[field.name], lang=lang,
stats = field.get_disaggregated_stats(metrics[field.contextual_name], lang=lang,
top_splitters=top_splitters)
yield (field, field.get_labels(lang)[0], stats)

Expand All @@ -194,11 +227,11 @@ def get_stats(self, submissions, fields=(), lang=UNSPECIFIED_TRANSLATION, split_
fields = all_fields
else:
fields.add(split_by)
fields = [field for field in all_fields if field.name in fields]
fields = [field for field in all_fields if field.contextual_name in fields]

if split_by:
try:
split_by_field = next(f for f in fields if f.name == split_by)
split_by_field = next(f for f in fields if f.contextual_name == split_by)
except StopIteration:
raise ValueError('No field matching name "%s" '
'for split_by' % split_by)
Expand Down
5 changes: 2 additions & 3 deletions src/formpack/reporting/export.py
Original file line number Diff line number Diff line change
Expand Up @@ -170,7 +170,7 @@ def get_fields_labels_tags_for_all_versions(self,

for field in all_fields:
section_fields.setdefault(field.section.name, []).append(
(field.name, field)
(field.contextual_name, field)
)
section_labels.setdefault(field.section.name, []).append(
field.get_labels(lang, group_sep,
Expand All @@ -197,7 +197,6 @@ def get_fields_labels_tags_for_all_versions(self,
auto_field_names.append(
"_submission_{}".format(copy_field))


# Flatten field labels and names. Indeed, field.get_labels()
# and self.names return a list because a multiple select field can
# have several values. We needed them grouped to insert them at the
Expand All @@ -218,7 +217,7 @@ def get_fields_labels_tags_for_all_versions(self,
# labels, add the tags once for each label
tags.extend(
[flatten_tag_list(field.tags, tag_cols_and_seps)] *
len(field.value_names)
len(field.value_names)
jnm marked this conversation as resolved.
Show resolved Hide resolved
)

names = [name for name_list in name_lists for name in name_list]
Expand Down
22 changes: 18 additions & 4 deletions src/formpack/schema/datadef.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,15 +23,26 @@ class FormDataDef(object):

def __init__(self, name, labels=None, has_stats=False, *args, **kwargs):
self.name = name
self.unique_name = name
self.use_unique_name = False
self.labels = labels or {}
self.value_names = self.get_value_names()
self.has_stats = has_stats

def __repr__(self):
return "<%s name='%s'>" % (self.__class__.__name__, self.name)
return "<%s name='%s'>" % (self.__class__.__name__, self.contextual_name)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

note: everywhere else, this is changed to contextual_name='%s'

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I have hesitated between using self.name for FormDataDef or changing the representation string because contextual_name is more related to FormField than its parent class.
Still wondering...


@property
def contextual_name(self):
if self.use_unique_name:
return self.unique_name
return self.name

@property
def value_names(self):
jnm marked this conversation as resolved.
Show resolved Hide resolved
return self.get_value_names()

def get_value_names(self):
return [self.name]
return [self.contextual_name]

@classmethod
def from_json_definition(cls, definition, translations=None):
Expand All @@ -48,6 +59,9 @@ def _extract_json_labels(cls, definition, translations):
labels = {}
return labels

def create_unique_name(self, suffix):
pass


class FormGroup(FormDataDef): # useful to get __repr__
pass
Expand Down Expand Up @@ -79,7 +93,7 @@ def from_json_definition(cls, definition, hierarchy=(None,), parent=None,
return cls(definition['name'], labels, hierarchy=hierarchy, parent=parent)

def get_label(self, lang=UNSPECIFIED_TRANSLATION):
return [self.labels.get(lang) or self.name]
return [self.labels.get(lang) or self.contextual_name]

def __repr__(self):
parent_name = getattr(self.parent, 'name', None)
Expand Down
Loading