Skip to content

Commit

Permalink
fixup! [a r] Index dataset description from Terra API (#5547)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 9, 2023
1 parent f438852 commit 6d19727
Show file tree
Hide file tree
Showing 4 changed files with 21 additions and 11 deletions.
16 changes: 12 additions & 4 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
attrgetter,
)
from typing import (
AbstractSet,
Callable,
Collection,
Iterable,
Expand All @@ -34,6 +35,7 @@

from azul import (
JSON,
cache,
)
from azul.collections import (
deep_dict_merge,
Expand Down Expand Up @@ -431,13 +433,15 @@ def reconcile_inner_entities(cls,
) -> tuple[JSON, BundleFQID]:
this_entity, this_bundle = this
that_entity, that_bundle = that
# All AnVIL bundles use a fixed known version
assert this_bundle.version == that_bundle.version, (this, that)
if this_entity.keys() == that_entity.keys():
return that if that_bundle.version > this_bundle.version else this
return this
else:
assert entity_type == 'datasets', (entity_type, this, that)
expected_keys = cls.field_types()[entity_type].keys()
expected_keys = cls._complete_dataset_keys()
# There will be one contribution for a DUOS stub, and many redundant
# contributions (one per non-duos bundle) for the dataset metadata
# contributions (one per non-DUOS bundle) for the dataset metadata
# from BigQuery. Once the stub has been merged with a single main
# contribution to consolidate all expected fields, we can disregard
# the other contributions as usual.
Expand All @@ -446,7 +450,6 @@ def reconcile_inner_entities(cls,
elif that_entity.keys() == expected_keys:
return that
else:
assert this_bundle.version == that_bundle.version, (this, that)
assert this_entity.keys() < expected_keys, this
assert that_entity.keys() < expected_keys, that
merged = deep_dict_merge((this_entity, that_entity))
Expand All @@ -455,6 +458,11 @@ def reconcile_inner_entities(cls,
# used by the caller, and we know the versions are equal.
return merged, this_bundle

@classmethod
@cache
def _complete_dataset_keys(cls) -> AbstractSet[str]:
    """
    Return the field names that a complete `datasets` entity is expected to
    have, i.e. the keys of the `datasets` entry in this class's field types.

    Decorated with `cache`, so the lookup is presumably performed only once
    per class and reused afterwards.
    """
    dataset_field_types = cls.field_types()['datasets']
    return dataset_field_types.keys()


class ActivityTransformer(BaseTransformer):

Expand Down
12 changes: 7 additions & 5 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -191,7 +191,7 @@ def _list_bundles(self,
validate_uuid_prefix(partition_prefix)
primary = BundleEntityType.primary.value
supplementary = BundleEntityType.supplementary.value
dataset = BundleEntityType.duos.value
duos = BundleEntityType.duos.value
rows = list(self._run_sql(f'''
SELECT datarepo_row_id, {primary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, primary))}
Expand All @@ -201,10 +201,12 @@ def _list_bundles(self,
FROM {backtick(self._full_table_name(spec, supplementary))} AS supp
WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{partition_prefix}')
''' + (
'' if config.duos_service_url is None else f'''
''
if config.duos_service_url is None else
f'''
UNION ALL
SELECT datarepo_row_id, {dataset!r} AS entity_type
FROM {backtick(self._full_table_name(spec, dataset))}
SELECT datarepo_row_id, {duos!r} AS entity_type
FROM {backtick(self._full_table_name(spec, duos))}
'''
)))
bundles = []
Expand All @@ -215,7 +217,7 @@ def _list_bundles(self,
# single dataset. This verification is performed independently and
# concurrently for every partition, but only one partition actually
# emits the bundle.
if row['entity_type'] == dataset:
if row['entity_type'] == duos:
require(0 == dataset_count)
dataset_count += 1
if not row['datarepo_row_id'].startswith(partition_prefix):
Expand Down
2 changes: 1 addition & 1 deletion test/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -147,7 +147,7 @@ def _get_all_hits(self):
and doc_type is DocumentType.contribution
and 'description' in one(hit['_source']['contents']['datasets'])
):
# Sparse dataset contributions contain no lists
# DUOS contributions contain no lists
continue
self._verify_sorted_lists(hit['_source'])
return hits
Expand Down
2 changes: 1 addition & 1 deletion test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def test_dataset_description(self):
# These fields are populated only in the primary bundle
self.assertEqual(dataset_ref.entity_id, contents['document_id'])
self.assertEqual(['phs000693'], contents['registered_identifier'])
# This field is populated only in the sparse dataset bundle
# This field is populated only in the DUOS bundle
self.assertEqual('Study description from DUOS', contents['description'])
self.assertEqual(1, doc_counts[DocumentType.aggregate])
self.assertEqual(2, doc_counts[DocumentType.contribution])
Expand Down

0 comments on commit 6d19727

Please sign in to comment.