diff --git a/environment.py b/environment.py index 1bbb5dfd03..6185fcf082 100644 --- a/environment.py +++ b/environment.py @@ -618,6 +618,11 @@ def env() -> Mapping[str, Optional[str]]: # 'AZUL_SAM_SERVICE_URL': None, + # The URL of Terra's DUOS service from which to index descriptions of + # AnVIL datasets. If left unset, this step is skipped during indexing. + # + 'AZUL_DUOS_SERVICE_URL': None, + # OAuth2 Client ID to be used for authenticating users. See section # 3.2 of the README # diff --git a/src/azul/__init__.py b/src/azul/__init__.py index 678c4d15c1..98d6377136 100644 --- a/src/azul/__init__.py +++ b/src/azul/__init__.py @@ -323,8 +323,9 @@ def sam_service_url(self) -> mutable_furl: return furl(self.environ['AZUL_SAM_SERVICE_URL']) @property - def duos_service_url(self) -> mutable_furl: - return furl(self.environ['AZUL_DUOS_SERVICE_URL']) + def duos_service_url(self) -> Optional[mutable_furl]: + url = self.environ['AZUL_DUOS_SERVICE_URL'] + return None if url is None else furl(url) @property def dss_query_prefix(self) -> str: diff --git a/src/azul/indexer/index_service.py b/src/azul/indexer/index_service.py index aceb68df1b..a3ecf70fe6 100644 --- a/src/azul/indexer/index_service.py +++ b/src/azul/indexer/index_service.py @@ -44,6 +44,9 @@ config, freeze, ) +from azul.collections import ( + deep_dict_merge, +) from azul.deployment import ( aws, ) @@ -718,7 +721,8 @@ def _select_latest(self, collated_entities.get(entity_id, (None, '', None)) if cur_entity is not None and entity.keys() != cur_entity.keys(): if cur_bundle_version == contribution.coordinates.bundle.version: - assert False, contribution.coordinates + assert contribution.entity.entity_type == 'datasets', contribution + entity = deep_dict_merge((entity, cur_entity)) else: symmetric_difference = set(entity.keys()).symmetric_difference(cur_entity) log.warning('Document shape of `%s` entity `%s` does not match between bundles ' diff --git a/src/azul/plugins/metadata/anvil/indexer/transform.py b/src/azul/plugins/metadata/anvil/indexer/transform.py index 2bbda3f74e..88b7a436a2 100644 --- a/src/azul/plugins/metadata/anvil/indexer/transform.py +++ b/src/azul/plugins/metadata/anvil/indexer/transform.py @@ -125,7 +125,7 @@ def field_types(cls) -> FieldTypes: return { 'activities': cls._activity_types(), 'biosamples': cls._biosample_types(), - 'datasets': {**cls._dataset_types(), **cls._sparse_dataset_types()}, + 'datasets': {**cls._dataset_types(), **cls._duos_types()}, 'diagnoses': cls._diagnosis_types(), 'donors': cls._donor_types(), 'files': cls._aggregate_file_types(), @@ -222,7 +222,7 @@ def _biosample_types(cls) -> FieldTypes: } @classmethod - def _sparse_dataset_types(cls) -> FieldTypes: + def _duos_types(cls) -> FieldTypes: return { 'document_id': null_str, 'description': null_str, @@ -377,8 +377,8 @@ def _biosample(self, biosample: EntityReference) -> MutableJSON: def _dataset(self, dataset: EntityReference) -> MutableJSON: return self._entity(dataset, self._dataset_types()) - def _sparse_dataset(self, dataset: EntityReference) -> MutableJSON: - return self._entity(dataset, self._sparse_dataset_types()) + def _duos(self, dataset: EntityReference) -> MutableJSON: + return self._entity(dataset, self._duos_types()) def _diagnosis(self, diagnosis: EntityReference) -> MutableJSON: return self._entity(diagnosis, @@ -459,7 +459,7 @@ def _transform(self, entity: EntityReference) -> Contribution: try: dataset = self._dataset(entity) except KeyError: - contents = dict(datasets=[self._sparse_dataset(entity)]) + contents = dict(datasets=[self._duos(entity)]) else: contents = dict( activities=self._entities(self._activity, chain.from_iterable( diff --git a/src/azul/plugins/repository/tdr_anvil/__init__.py b/src/azul/plugins/repository/tdr_anvil/__init__.py index c98e72ca63..c6e23d440b 100644 --- a/src/azul/plugins/repository/tdr_anvil/__init__.py +++ b/src/azul/plugins/repository/tdr_anvil/__init__.py @@ -113,7 +113,7 @@ class BundleEntityType(Enum): """ primary: EntityType = 'biosample' supplementary: EntityType = 'file' - dataset: EntityType = 'dataset' + duos: EntityType = 'dataset' class AnvilBundleFQIDJSON(SourcedBundleFQIDJSON): @@ -191,7 +191,7 @@ def _list_bundles(self, validate_uuid_prefix(partition_prefix) primary = BundleEntityType.primary.value supplementary = BundleEntityType.supplementary.value - dataset = BundleEntityType.dataset.value + dataset = BundleEntityType.duos.value rows = list(self._run_sql(f''' SELECT datarepo_row_id, {primary!r} AS entity_type FROM {backtick(self._full_table_name(spec, primary))} @@ -200,10 +200,13 @@ def _list_bundles(self, SELECT datarepo_row_id, {supplementary!r} AS entity_type FROM {backtick(self._full_table_name(spec, supplementary))} AS supp WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{partition_prefix}') + ''' + ( + '' if config.duos_service_url is None else f''' UNION ALL SELECT datarepo_row_id, {dataset!r} AS entity_type FROM {backtick(self._full_table_name(spec, dataset))} - ''')) + ''' + ))) bundles = [] dataset_count = 0 for row in rows: @@ -280,7 +283,8 @@ def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle: elif bundle_fqid.entity_type is BundleEntityType.supplementary: log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid) return self._supplementary_bundle(bundle_fqid) - elif bundle_fqid.entity_type is BundleEntityType.dataset: + elif bundle_fqid.entity_type is BundleEntityType.duos: + assert config.duos_service_url is not None, bundle_fqid log.info('Bundle %r is a dataset description', bundle_fqid.uuid) return self._dataset_description(bundle_fqid) else: diff --git a/test/indexer/test_anvil.py b/test/indexer/test_anvil.py index cae2520cf1..03b1b965ba 100644 --- a/test/indexer/test_anvil.py +++ b/test/indexer/test_anvil.py @@ -102,7 +102,7 @@ def test_dataset_description(self): entity_id='2370f948-2783-4eb6-afea-e022897f4dcf') dataset_bundle = self.bundle_fqid(uuid='2370f948-2783-aeb6-afea-e022897f4dcf', version=self.bundle.version, - entity_type=BundleEntityType.dataset) + entity_type=BundleEntityType.duos) bundles = [self.bundle, dataset_bundle] for bundle_fqid in bundles: