Skip to content

Commit

Permalink
fixup! [a r] Index dataset description from Terra API (#5547)
Browse files: browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Nov 1, 2023
1 parent 49ef44c commit c0f92b9
Show file tree
Hide file tree
Showing 6 changed files with 27 additions and 13 deletions.
5 changes: 5 additions & 0 deletions environment.py
Original file line number Diff line number Diff line change
Expand Up @@ -618,6 +618,11 @@ def env() -> Mapping[str, Optional[str]]:
#
'AZUL_SAM_SERVICE_URL': None,

# The URL of Terra's DUOS service from which to index descriptions of
# AnVIL datasets. If left unset, dataset descriptions are not indexed.
#
'AZUL_DUOS_SERVICE_URL': None,

# OAuth2 Client ID to be used for authenticating users. See section
# 3.2 of the README
#
Expand Down
5 changes: 3 additions & 2 deletions src/azul/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -323,8 +323,9 @@ def sam_service_url(self) -> mutable_furl:
return furl(self.environ['AZUL_SAM_SERVICE_URL'])

@property
def duos_service_url(self) -> mutable_furl:
return furl(self.environ['AZUL_DUOS_SERVICE_URL'])
def duos_service_url(self) -> Optional[mutable_furl]:
url = self.environ.get('AZUL_DUOS_SERVICE_URL')
return None if url is None else furl(url)

@property
def dss_query_prefix(self) -> str:
Expand Down
6 changes: 5 additions & 1 deletion src/azul/indexer/index_service.py
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,9 @@
config,
freeze,
)
from azul.collections import (
deep_dict_merge,
)
from azul.deployment import (
aws,
)
Expand Down Expand Up @@ -718,7 +721,8 @@ def _select_latest(self,
collated_entities.get(entity_id, (None, '', None))
if cur_entity is not None and entity.keys() != cur_entity.keys():
if cur_bundle_version == contribution.coordinates.bundle.version:
assert False, contribution.coordinates
assert contribution.entity.entity_type == 'datasets', contribution
entity = deep_dict_merge((entity, cur_entity))
else:
symmetric_difference = set(entity.keys()).symmetric_difference(cur_entity)
log.warning('Document shape of `%s` entity `%s` does not match between bundles '
Expand Down
10 changes: 5 additions & 5 deletions src/azul/plugins/metadata/anvil/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ def field_types(cls) -> FieldTypes:
return {
'activities': cls._activity_types(),
'biosamples': cls._biosample_types(),
'datasets': {**cls._dataset_types(), **cls._sparse_dataset_types()},
'datasets': {**cls._dataset_types(), **cls._duos_types()},
'diagnoses': cls._diagnosis_types(),
'donors': cls._donor_types(),
'files': cls._aggregate_file_types(),
Expand Down Expand Up @@ -222,7 +222,7 @@ def _biosample_types(cls) -> FieldTypes:
}

@classmethod
def _sparse_dataset_types(cls) -> FieldTypes:
def _duos_types(cls) -> FieldTypes:
return {
'document_id': null_str,
'description': null_str,
Expand Down Expand Up @@ -377,8 +377,8 @@ def _biosample(self, biosample: EntityReference) -> MutableJSON:
def _dataset(self, dataset: EntityReference) -> MutableJSON:
return self._entity(dataset, self._dataset_types())

def _sparse_dataset(self, dataset: EntityReference) -> MutableJSON:
return self._entity(dataset, self._sparse_dataset_types())
def _duos(self, dataset: EntityReference) -> MutableJSON:
return self._entity(dataset, self._duos_types())

def _diagnosis(self, diagnosis: EntityReference) -> MutableJSON:
return self._entity(diagnosis,
Expand Down Expand Up @@ -459,7 +459,7 @@ def _transform(self, entity: EntityReference) -> Contribution:
try:
dataset = self._dataset(entity)
except KeyError:
contents = dict(datasets=[self._sparse_dataset(entity)])
contents = dict(datasets=[self._duos(entity)])
else:
contents = dict(
activities=self._entities(self._activity, chain.from_iterable(
Expand Down
12 changes: 8 additions & 4 deletions src/azul/plugins/repository/tdr_anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ class BundleEntityType(Enum):
"""
primary: EntityType = 'biosample'
supplementary: EntityType = 'file'
dataset: EntityType = 'dataset'
duos: EntityType = 'dataset'


class AnvilBundleFQIDJSON(SourcedBundleFQIDJSON):
Expand Down Expand Up @@ -191,7 +191,7 @@ def _list_bundles(self,
validate_uuid_prefix(partition_prefix)
primary = BundleEntityType.primary.value
supplementary = BundleEntityType.supplementary.value
dataset = BundleEntityType.dataset.value
dataset = BundleEntityType.duos.value
rows = list(self._run_sql(f'''
SELECT datarepo_row_id, {primary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, primary))}
Expand All @@ -200,10 +200,13 @@ def _list_bundles(self,
SELECT datarepo_row_id, {supplementary!r} AS entity_type
FROM {backtick(self._full_table_name(spec, supplementary))} AS supp
WHERE supp.is_supplementary AND STARTS_WITH(datarepo_row_id, '{partition_prefix}')
''' + (
'' if config.duos_service_url is None else f'''
UNION ALL
SELECT datarepo_row_id, {dataset!r} AS entity_type
FROM {backtick(self._full_table_name(spec, dataset))}
'''))
'''
)))
bundles = []
dataset_count = 0
for row in rows:
Expand Down Expand Up @@ -280,7 +283,8 @@ def _emulate_bundle(self, bundle_fqid: AnvilBundleFQID) -> TDRAnvilBundle:
elif bundle_fqid.entity_type is BundleEntityType.supplementary:
log.info('Bundle %r is a supplementary bundle', bundle_fqid.uuid)
return self._supplementary_bundle(bundle_fqid)
elif bundle_fqid.entity_type is BundleEntityType.dataset:
elif bundle_fqid.entity_type is BundleEntityType.duos:
assert config.duos_service_url is not None, bundle_fqid
log.info('Bundle %r is a dataset description', bundle_fqid.uuid)
return self._dataset_description(bundle_fqid)
else:
Expand Down
2 changes: 1 addition & 1 deletion test/indexer/test_anvil.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,7 +102,7 @@ def test_dataset_description(self):
entity_id='2370f948-2783-4eb6-afea-e022897f4dcf')
dataset_bundle = self.bundle_fqid(uuid='2370f948-2783-aeb6-afea-e022897f4dcf',
version=self.bundle.version,
entity_type=BundleEntityType.dataset)
entity_type=BundleEntityType.duos)

bundles = [self.bundle, dataset_bundle]
for bundle_fqid in bundles:
Expand Down

0 comments on commit c0f92b9

Please sign in to comment.