Skip to content

Commit

Permalink
[p] Extract is_stitched from manifest entries (partial #6299)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Aug 27, 2024
1 parent 8ef878d commit 29214a9
Show file tree
Hide file tree
Showing 60 changed files with 2,050 additions and 2,253 deletions.
1 change: 0 additions & 1 deletion attic/scripts/recan_bundle_tdr.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,7 +177,6 @@ def dss_bundle_to_tdr(bundle: Bundle, source: TDRSourceRef) -> TDRHCABundle:
links_entry = None
for entry in manifest:
entry['version'] = convert_version(entry['version'])
entry['is_stitched'] = False
if entry['name'] == 'links.json':
links_entry = entry
if entry['indexed']:
Expand Down
3 changes: 2 additions & 1 deletion src/azul/plugins/metadata/hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,8 @@ def transformers(self,
version=bundle.version,
manifest=bundle.manifest,
metadata_files=bundle.metadata_files,
links_json=bundle.links)
links_json=bundle.links,
stitched_entity_ids=bundle.stitched)

def transformers():
for transformer_cls in self.transformer_types():
Expand Down
10 changes: 9 additions & 1 deletion src/azul/plugins/metadata/hca/bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ class HCABundle(Bundle[BUNDLE_FQID], ABC):
"""
metadata_files: MutableJSON
links: MutableJSON
stitched: set[str] = attrs.field(factory=set)

def reject_joiner(self, catalog: CatalogName):
self._reject_joiner(self.manifest)
Expand All @@ -52,14 +53,21 @@ def to_json(self) -> MutableJSON:
'manifest': self.manifest,
'metadata': self.metadata_files,
'links': self.links,
'stitched': sorted(self.stitched)
}

@classmethod
def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> 'Bundle':
manifest = json_['manifest']
metadata = json_['metadata']
links = json_['links']
stitched = json_['stitched']
assert isinstance(manifest, list), manifest
assert isinstance(metadata, dict), metadata
assert isinstance(links, dict), links
return cls(fqid=fqid, manifest=manifest, metadata_files=metadata, links=links)
assert isinstance(stitched, list), stitched
return cls(fqid=fqid,
manifest=manifest,
metadata_files=metadata,
links=links,
stitched=set(stitched))
6 changes: 3 additions & 3 deletions src/azul/plugins/metadata/hca/indexer/transform.py
Original file line number Diff line number Diff line change
Expand Up @@ -1428,7 +1428,7 @@ def entity_type(cls) -> str:
return 'files'

def _entities(self) -> Iterable[api.File]:
return api.not_stitched(self.api_bundle.files.values())
return self.api_bundle.not_stitched(self.api_bundle.files)

def _transform(self, files: Iterable[api.File]) -> Iterable[Contribution]:
zarr_stores: Mapping[str, list[api.File]] = self.group_zarrs(files)
Expand Down Expand Up @@ -1586,7 +1586,7 @@ def inner_entity_types(cls) -> frozenset[str]:

def _entities(self) -> Iterable[Sample]:
samples: dict[str, Sample] = dict()
for file in api.not_stitched(self.api_bundle.files.values()):
for file in self.api_bundle.not_stitched(self.api_bundle.files):
self._find_ancestor_samples(file, samples)
return samples.values()

Expand Down Expand Up @@ -1643,7 +1643,7 @@ def _singleton_entity(self) -> DatedEntity:
raise NotImplementedError

def _dated_entities(self) -> Iterable[DatedEntity]:
return api.not_stitched(self.api_bundle.entities.values())
return self.api_bundle.not_stitched(self.api_bundle.entities)

def estimate(self, partition: BundlePartition) -> int:
return int(partition.contains(self._singleton_id))
Expand Down
8 changes: 3 additions & 5 deletions src/azul/plugins/repository/tdr_hca/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -189,13 +189,14 @@ def add_entity(self,
row: BigQueryRow,
is_stitched: bool
) -> None:
if is_stitched:
self.stitched.add(entity.entity_id)
self._add_manifest_entry(name=entity_key,
uuid=entity.entity_id,
version=TDRPlugin.format_version(row['version']),
size=row['content_size'],
content_type='application/json',
dcp_type=f'"metadata/{row["schema_type"]}"',
is_stitched=is_stitched)
dcp_type=f'"metadata/{row["schema_type"]}"')
if entity.entity_type.endswith('_file'):
descriptor = json.loads(row['descriptor'])
self._add_manifest_entry(name=row['file_name'],
Expand All @@ -204,7 +205,6 @@ def add_entity(self,
size=descriptor['size'],
content_type=descriptor['content_type'],
dcp_type='data',
is_stitched=is_stitched,
checksums=Checksums.from_json(descriptor),
drs_uri=self._parse_drs_uri(row['file_id'], descriptor))
content = row['content']
Expand Down Expand Up @@ -241,7 +241,6 @@ def _add_manifest_entry(self,
size: int,
content_type: str,
dcp_type: str,
is_stitched: bool,
checksums: Optional[Checksums] = None,
drs_uri: Optional[str] = None) -> None:

Expand All @@ -251,7 +250,6 @@ def _add_manifest_entry(self,
'version': version,
'content-type': f'{content_type}; dcp-type={dcp_type}',
'size': size,
'is_stitched': is_stitched,
**(
{
'indexed': True,
Expand Down
36 changes: 11 additions & 25 deletions src/humancellatlas/data/metadata/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,8 +18,8 @@
chain,
)
from typing import (
AbstractSet,
Iterable,
Iterator,
Mapping,
MutableMapping,
Optional,
Expand Down Expand Up @@ -77,7 +77,6 @@ class ManifestEntry:
# FIXME: Change Bundle.version and ManifestEntry.version from string to datetime
# https://github.com/DataBiosphere/hca-metadata-api/issues/48
version: str
is_stitched: bool = field(init=False)

def __init__(self, json: MutableJSON):
# '/' was once forbidden in file paths and was encoded with '!'. Now
Expand All @@ -87,7 +86,6 @@ def __init__(self, json: MutableJSON):
self.json = json
self.content_type = json['content-type']
self.uuid = UUID4(json['uuid'])
self.is_stitched = json.get('is_stitched', False)
for f in fields(self):
if f.init:
value = json.get(f.name)
Expand All @@ -106,13 +104,6 @@ class Entity:
submission_date: datetime
update_date: Optional[datetime]

@property
def is_stitched(self):
    """
    The `is_stitched` flag of this entity's metadata manifest entry, or
    `False` when the entity has no metadata manifest entry.
    """
    entry = self.metadata_manifest_entry
    return False if entry is None else entry.is_stitched

@classmethod
def from_json(cls,
json: JSON,
Expand Down Expand Up @@ -170,20 +161,6 @@ def accept(self, visitor: 'EntityVisitor') -> None:
E = TypeVar('E', bound=Entity)


# noinspection PyPep8Naming
@dataclass(frozen=True)
class not_stitched(Iterable[E]):
    """
    A filtered view of the wrapped iterable that yields only those entities
    whose `is_stitched` attribute is falsy. Every call to `iter()` starts a
    new pass over the wrapped iterable, so the view itself is not exhausted
    by being consumed.
    """

    # The iterable being filtered; it is never copied or modified here.
    entities: Iterable[E]

    def __iter__(self) -> Iterator[E]:
        return filter(lambda entity: not entity.is_stitched, self.entities)


class TypeLookupError(Exception):

def __init__(self, described_by: str) -> None:
Expand Down Expand Up @@ -1001,10 +978,12 @@ def __init__(self,
version: str,
manifest: MutableJSONs,
metadata_files: Mapping[str, JSON],
links_json: JSON):
links_json: JSON,
stitched_entity_ids: AbstractSet[str] = frozenset()):
self.uuid = UUID4(uuid)
self.version = version
self.manifest = {m.name: m for m in map(ManifestEntry, manifest)}
self.stitched = stitched_entity_ids

json_by_core_cls: MutableMapping[type[E], list[tuple[JSON, ManifestEntry]]] = defaultdict(list)
for file_name, json in metadata_files.items():
Expand Down Expand Up @@ -1098,6 +1077,13 @@ def visit(entity: Entity) -> Mapping[UUID4, L]:

return recurse(self.root_entities().values())

def not_stitched(self, entities: Mapping[UUID, E]) -> list[E]:
    """
    Return the values of the given mapping whose key, converted to a string,
    is not a member of `self.stitched`. The result is a new list in the
    mapping's iteration order; the mapping itself is not modified.
    """
    kept = []
    for entity_id, candidate in entities.items():
        if str(entity_id) in self.stitched:
            # Stitched entities are left out of the result
            continue
        kept.append(candidate)
    return kept

@cached_property
def leaf_cell_suspensions(self) -> Mapping[UUID4, CellSuspension]:
return self.leaf_entities(CellSuspension)
Expand Down
Loading

0 comments on commit 29214a9

Please sign in to comment.