From 2eedaa7865d082284674012a9d3a5223c11bf45e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 02:12:34 +0200 Subject: [PATCH 1/9] Improve perforamance of _pluck_uniq_cols Use a set instead of an OrderedDict with empty values for gathering unique values. O(n^2) -> O(nlogn) --- src/formpack/utils/expand_content.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index eb271cec..8015ee4f 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -158,7 +158,7 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols = OrderedDict() + uniq_cols = set() special = OrderedDict() known_translated_cols = content.get('translated', []) @@ -169,7 +169,7 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - uniq_cols.update(OrderedDict.fromkeys(_cols)) + uniq_cols.update(_cols) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -178,7 +178,7 @@ def _mark_special(**kwargs): _pluck_uniq_cols('survey') _pluck_uniq_cols('choices') - for column_name in uniq_cols.keys(): + for column_name in uniq_cols: if column_name in ['label', 'hint']: _mark_special(column_name=column_name, column=column_name, From 7de10695dcde0c280d9e72f99c62f75b9f36e33e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 02:24:25 +0200 Subject: [PATCH 2/9] Fix: preserve order of columns in _get_special_survey_cols --- src/formpack/utils/expand_content.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index 8015ee4f..8bf309a6 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -178,7 +178,7 @@ def _mark_special(**kwargs): _pluck_uniq_cols('survey') _pluck_uniq_cols('choices') - for column_name in uniq_cols: + for column_name in sorted(uniq_cols): if column_name in ['label', 'hint']: _mark_special(column_name=column_name, column=column_name, From 68987d1d86fe928ff17ed5856198fa17a9df7980 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 02:47:46 +0200 Subject: [PATCH 3/9] Fix: preserve the original order of columns instead of sorting --- src/formpack/utils/expand_content.py | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index 8bf309a6..0144fa2e 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -158,7 +158,8 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols = set() + uniq_cols_unordered = set() + uniq_cols = [] special = OrderedDict() known_translated_cols = content.get('translated', []) @@ -169,7 +170,11 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - uniq_cols.update(_cols) + for _col in _cols: + if _col in uniq_cols_unordered: + continue + uniq_cols_unordered.add(_col) + uniq_cols.append(_col) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -178,7 +183,7 @@ def _mark_special(**kwargs): _pluck_uniq_cols('survey') _pluck_uniq_cols('choices') - for column_name in sorted(uniq_cols): + for column_name in uniq_cols: if column_name in ['label', 'hint']: _mark_special(column_name=column_name, column=column_name, @@ -218,7 +223,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols: + if column_shortname in uniq_cols_unordered: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED) From db116cc095822b955020da8ade00edcaf2235c7a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 03:35:33 +0200 Subject: [PATCH 4/9] Add comment explaining the unusual code --- src/formpack/utils/expand_content.py | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index 0144fa2e..e1dfe825 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -158,8 +158,18 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols_unordered = set() + uniq_cols_set = set() uniq_cols = [] + """ + The reason for two separate data structures is performance. The goal is to have a unique + set that preserves insertion order. + + We implement that by using set() for uniqueness and list() for order. + + Python has OrderedDict that provides that functionality, but the performance is slightly + worse compared to this solution. + """ + special = OrderedDict() known_translated_cols = content.get('translated', []) @@ -171,9 +181,9 @@ def _pluck_uniq_cols(sheet_name): _cols = [r for r in row.keys() if r not in known_translated_cols] for _col in _cols: - if _col in uniq_cols_unordered: + if _col in uniq_cols_set: continue - uniq_cols_unordered.add(_col) + uniq_cols_set.add(_col) uniq_cols.append(_col) def _mark_special(**kwargs): @@ -223,7 +233,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols_unordered: + if column_shortname in uniq_cols_set: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED) From 3949477a2b152a0f338395348323a73880004e83 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 03:41:41 +0200 Subject: [PATCH 5/9] Use an existing OrderedSet library instead of custom implementation --- setup.py | 2 ++ src/formpack/utils/expand_content.py | 21 ++++----------------- 2 files changed, 6 insertions(+), 17 deletions(-) diff --git a/setup.py b/setup.py index 27333f16..0cf78c66 100644 --- a/setup.py +++ b/setup.py @@ -18,6 +18,7 @@ 'path.py<12', # Pinned for Python 2 compatibility 'pyquery', 'pyxform', + 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) @@ -41,6 +42,7 @@ 'path.py', 'pyquery', 'pyxform', + 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index e1dfe825..b6d0f8fd 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -6,6 +6,7 @@ from __future__ import (unicode_literals, print_function, absolute_import, division) from copy import deepcopy +from orderedset import OrderedSet import re from .array_to_xpath import EXPANDABLE_FIELD_TYPES @@ -158,17 +159,7 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols_set = set() - uniq_cols = [] - """ - The reason for two separate data structures is performance. The goal is to have a unique - set that preserves insertion order. - - We implement that by using set() for uniqueness and list() for order. - - Python has OrderedDict that provides that functionality, but the performance is slightly - worse compared to this solution. - """ + uniq_cols = OrderedSet() special = OrderedDict() @@ -180,11 +171,7 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - for _col in _cols: - if _col in uniq_cols_set: - continue - uniq_cols_set.add(_col) - uniq_cols.append(_col) + uniq_cols.update(_cols) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -233,7 +220,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols_set: + if column_shortname in uniq_cols: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED) From 91e90111f1761308e82740728684eee26d50407d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 04:36:51 +0200 Subject: [PATCH 6/9] Revert "Use an existing OrderedSet library instead of custom implementation" This reverts commit 3949477a2b152a0f338395348323a73880004e83. --- setup.py | 2 -- src/formpack/utils/expand_content.py | 21 +++++++++++++++++---- 2 files changed, 17 insertions(+), 6 deletions(-) diff --git a/setup.py b/setup.py index 0cf78c66..27333f16 100644 --- a/setup.py +++ b/setup.py @@ -18,7 +18,6 @@ 'path.py<12', # Pinned for Python 2 compatibility 'pyquery', 'pyxform', - 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) @@ -42,7 +41,6 @@ 'path.py', 'pyquery', 'pyxform', - 'orderedset', 'statistics', 'XlsxWriter', 'backports.csv', # Remove after dropping Python 2 support (and rewrite `imports`) diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index b6d0f8fd..e1dfe825 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -6,7 +6,6 @@ from __future__ import (unicode_literals, print_function, absolute_import, division) from copy import deepcopy -from orderedset import OrderedSet import re from .array_to_xpath import EXPANDABLE_FIELD_TYPES @@ -159,7 +158,17 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols = OrderedSet() + uniq_cols_set = set() + uniq_cols = [] + """ + The reason for two separate data structures is performance. The goal is to have a unique + set that preserves insertion order. + + We implement that by using set() for uniqueness and list() for order. + + Python has OrderedDict that provides that functionality, but the performance is slightly + worse compared to this solution. + """ special = OrderedDict() @@ -171,7 +180,11 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - uniq_cols.update(_cols) + for _col in _cols: + if _col in uniq_cols_set: + continue + uniq_cols_set.add(_col) + uniq_cols.append(_col) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -220,7 +233,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols: + if column_shortname in uniq_cols_set: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED) From 23e53be22ff0be0a19a3675367532fbd0b38a7ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 04:55:19 +0200 Subject: [PATCH 7/9] Extract OrderedSet() --- src/formpack/utils/expand_content.py | 22 +++----------- src/formpack/utils/ordered_set.py | 44 ++++++++++++++++++++++++++++ 2 files changed, 48 insertions(+), 18 deletions(-) create mode 100644 src/formpack/utils/ordered_set.py diff --git a/src/formpack/utils/expand_content.py b/src/formpack/utils/expand_content.py index e1dfe825..fc8a4322 100644 --- a/src/formpack/utils/expand_content.py +++ b/src/formpack/utils/expand_content.py @@ -11,6 +11,7 @@ from .array_to_xpath import EXPANDABLE_FIELD_TYPES from .future import iteritems, OrderedDict from .iterator import get_first_occurrence +from .ordered_set import OrderedSet from .replace_aliases import META_TYPES from .string import str_types from ..constants import (UNTRANSLATED, OR_OTHER_COLUMN, @@ -158,18 +159,7 @@ def _get_special_survey_cols(content): 'hint::English', For more examples, see tests. """ - uniq_cols_set = set() - uniq_cols = [] - """ - The reason for two separate data structures is performance. The goal is to have a unique - set that preserves insertion order. - - We implement that by using set() for uniqueness and list() for order. - - Python has OrderedDict that provides that functionality, but the performance is slightly - worse compared to this solution. - """ - + uniq_cols = OrderedSet() special = OrderedDict() known_translated_cols = content.get('translated', []) @@ -180,11 +170,7 @@ def _pluck_uniq_cols(sheet_name): # to be parsed and translated in a previous iteration _cols = [r for r in row.keys() if r not in known_translated_cols] - for _col in _cols: - if _col in uniq_cols_set: - continue - uniq_cols_set.add(_col) - uniq_cols.append(_col) + uniq_cols.update(_cols) def _mark_special(**kwargs): column_name = kwargs.pop('column_name') @@ -233,7 +219,7 @@ def _mark_special(**kwargs): translation=matched[1]) # also add the empty column if it exists - if column_shortname in uniq_cols_set: + if column_shortname in uniq_cols: _mark_special(column_name=column_shortname, column=column_shortname, translation=UNTRANSLATED) diff --git a/src/formpack/utils/ordered_set.py b/src/formpack/utils/ordered_set.py new file mode 100644 index 00000000..dc3bb453 --- /dev/null +++ b/src/formpack/utils/ordered_set.py @@ -0,0 +1,44 @@ +import collections + + +class OrderedSet(collections.MutableSet): + def __init__(self, iterable=None): + self.set = set() + self.list = [] + if iterable is not None: + self |= iterable + + def __len__(self): + return len(self.list) + + def __contains__(self, key): + return key in self.set + + def add(self, key): + if key not in self.set: + set.add(key) + list.append(key) + + def update(self, keys): + map(self.add, keys) + + def discard(self, key): + if key in self.set: + self.set.discard(key) + self.list.remove(key) + + def __iter__(self): + curr = 0 + while curr < len(self.list): + yield self.list[curr] + curr += 1 + + def __repr__(self): + if not self: + return '%s()' % (self.__class__.__name__,) + return '%s(%r)' % (self.__class__.__name__, list(self)) + + def __eq__(self, other): + if isinstance(other, OrderedSet): + return len(self) == len(other) and list(self) == list(other) + return set(self) == set(other) From c431df9b83220da222b42ad20859ab1d037794be Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 04:57:52 +0200 Subject: [PATCH 8/9] Fix: OrderedSet.add --- src/formpack/utils/ordered_set.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/formpack/utils/ordered_set.py b/src/formpack/utils/ordered_set.py index dc3bb453..b9c5c085 100644 --- a/src/formpack/utils/ordered_set.py +++ b/src/formpack/utils/ordered_set.py @@ -16,8 +16,8 @@ def __contains__(self, key): def add(self, key): if key not in self.set: - set.add(key) - list.append(key) + self.set.add(key) + self.list.append(key) def update(self, keys): map(self.add, keys) From b37f9e6213d0defec5d072bf36fa0b93f4039a9a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Micha=C5=82=20=C5=81azowik?= Date: Sat, 2 May 2020 05:01:46 +0200 Subject: [PATCH 9/9] Fix: OrderedSet.add order --- src/formpack/utils/ordered_set.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/formpack/utils/ordered_set.py b/src/formpack/utils/ordered_set.py index b9c5c085..53dd57b0 100644 --- a/src/formpack/utils/ordered_set.py +++ b/src/formpack/utils/ordered_set.py @@ -20,7 +20,8 @@ def add(self, key): self.list.append(key) def update(self, keys): - map(self.add, keys) + for key in keys: + self.add(key) def discard(self, key): if key in self.set: