Skip to content

Commit

Permalink
Push down HCA specifics to a Bundle subclass (#4940)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc committed Apr 19, 2023
1 parent 35dc241 commit e2cd04f
Show file tree
Hide file tree
Showing 13 changed files with 336 additions and 283 deletions.
27 changes: 9 additions & 18 deletions src/azul/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@
from azul.types import (
JSON,
MutableJSON,
MutableJSONs,
SupportsLessThan,
get_generic_type_params,
)
Expand Down Expand Up @@ -396,23 +395,6 @@ def to_json(self) -> SourcedBundleFQIDJSON:
@attr.s(auto_attribs=True, kw_only=True)
class Bundle(ABC, Generic[BUNDLE_FQID]):
fqid: BUNDLE_FQID
manifest: MutableJSONs
"""
Each item of the `manifest` attribute's value has this shape:
{
'content-type': 'application/json; dcp-type="metadata/biomaterial"',
'crc32c': 'fd239631',
'indexed': True,
'name': 'cell_suspension_0.json',
's3_etag': 'aa31c093cc816edb1f3a42e577872ec6',
'sha1': 'f413a9a7923dee616309e4f40752859195798a5d',
'sha256': 'ea4c9ed9e53a3aa2ca4b7dffcacb6bbe9108a460e8e15d2b3d5e8e5261fb043e',
'size': 1366,
'uuid': '0136ebb4-1317-42a0-8826-502fae25c29f',
'version': '2019-05-16T162155.020000Z'
}
"""
metadata_files: MutableJSON

@property
def uuid(self) -> BundleUUID:
Expand All @@ -432,6 +414,15 @@ def drs_path(self, manifest_entry: JSON) -> Optional[str]:
"""
raise NotImplementedError

@abstractmethod
def to_json(self) -> MutableJSON:
    """
    Return a mutable JSON representation of this bundle.

    Abstract: the base class only raises `NotImplementedError`; the layout
    of the returned JSON is defined by each concrete subclass.
    """
    raise NotImplementedError

@classmethod
@abstractmethod
def from_json(cls, fqid: BUNDLE_FQID, json: JSON) -> 'Bundle':
    """
    Alternate constructor that builds a bundle from JSON.

    :param fqid: the fully qualified ID to assign to the new bundle

    :param json: the JSON to build the bundle from; presumably in the
                 layout produced by `to_json` of the same subclass

    Abstract: the base class only raises `NotImplementedError`.
    """
    raise NotImplementedError


class BundlePartition(UUIDPartition['BundlePartition']):
"""
Expand Down
106 changes: 106 additions & 0 deletions src/azul/plugins/anvil.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from abc import (
ABC,
)
from typing import (
AbstractSet,
Generic,
Iterable,
Optional,
TypeVar,
Union,
)

import attr
from more_itertools import (
one,
)

from azul.indexer import (
BUNDLE_FQID,
Bundle,
)
from azul.indexer.document import (
EntityReference,
EntityType,
)
from azul.types import (
JSON,
MutableJSON,
)

# AnVIL snapshots do not use UUIDs for primary/foreign keys.
# This type alias helps us distinguish these keys from the document UUIDs,
# which are drawn from the `datarepo_row_id` column.
# Note that entities from different tables may have the same key, so
# `KeyReference` should be used when mixing keys from different entity types.
# The alias is a plain `str`; it exists for the benefit of readers and type
# annotations only.
Key = str


@attr.s(auto_attribs=True, frozen=True, slots=True, kw_only=True)
class KeyReference:
    """
    A key qualified by the type of the entity it refers to.

    Instances are immutable and must be constructed with keyword arguments.
    """
    # The key of the referenced entity
    key: Key
    # The type of the referenced entity
    entity_type: EntityType


# Type variable for the kind of reference a generic container holds. It is
# bounded so that it can only be bound to `EntityReference` or `KeyReference`
# (or a union of the two).
ENTITY_REF = TypeVar(name='ENTITY_REF', bound=Union[EntityReference, KeyReference])


@attr.s(auto_attribs=True, frozen=True, kw_only=True, order=False)
class Link(Generic[ENTITY_REF]):
    """
    An immutable association between a set of input references, an optional
    activity reference and a set of output references.

    Instances are hashable so that they can be collected in sets. Only `<` is
    defined for ordering (see `__lt__`), which is what `sorted` requires.
    """
    # The converters coerce whatever iterable the caller passes (notably a
    # plain, unhashable `set`) into a `frozenset`. Without that, the hash
    # attrs generates for this frozen class would raise `TypeError` as soon
    # as the instance is added to a set or used as a dictionary key.
    inputs: AbstractSet[ENTITY_REF] = attr.ib(factory=frozenset, converter=frozenset)
    activity: Optional[ENTITY_REF] = attr.ib(default=None)
    outputs: AbstractSet[ENTITY_REF] = attr.ib(factory=frozenset, converter=frozenset)

    @property
    def all_entities(self) -> AbstractSet[ENTITY_REF]:
        """
        The union of the inputs, the outputs and, if present, the activity.
        """
        activity = frozenset() if self.activity is None else frozenset((self.activity,))
        return self.inputs | self.outputs | activity

    def to_json(self) -> MutableJSON:
        """
        Return a JSON representation in which every reference is converted
        to a string and the inputs and outputs are sorted lists.
        """
        return {
            'inputs': sorted(map(str, self.inputs)),
            'activity': None if self.activity is None else str(self.activity),
            'outputs': sorted(map(str, self.outputs))
        }

    @classmethod
    def merge(cls, links: Iterable['Link']) -> 'Link':
        """
        Combine the given links into one whose inputs and outputs are the
        unions of the inputs and outputs of the individual links.

        :param links: the links to merge; all of them must have the same
                      activity (or all must have none)

        :raises ValueError: if `links` is empty or if the links do not agree
                            on a single activity (raised by `one`)
        """
        # Materialize first: the argument is traversed three times below, and
        # a generator would be exhausted after the first traversal.
        links = list(links)
        # Start from an empty frozenset so that the union works regardless of
        # the concrete set type each link holds.
        return cls(inputs=frozenset().union(*(link.inputs for link in links)),
                   activity=one({link.activity for link in links}),
                   outputs=frozenset().union(*(link.outputs for link in links)))

    def __lt__(self, other: 'Link') -> bool:
        """
        Order links by their activity. A link without an activity is never
        considered less than another link, nor is another link considered
        less than it.
        """
        if self.activity is None or other.activity is None:
            return False
        else:
            return self.activity < other.activity


@attr.s(auto_attribs=True, kw_only=True)
class AnvilBundle(Bundle[BUNDLE_FQID], ABC):
    """
    A bundle made up of entities, keyed by entity reference, and of links
    between those entities.
    """
    # Maps the reference of each entity in the bundle to that entity's content
    entities: dict[EntityReference, MutableJSON] = attr.ib(factory=dict)
    # The links between the entities in this bundle
    links: set[Link[EntityReference]] = attr.ib(factory=set)

    def to_json(self) -> MutableJSON:
        """
        Return a JSON representation of this bundle with entities sorted by
        reference and links in sorted order. Entity references are converted
        to strings so they can serve as JSON object keys.
        """
        return {
            'entities': {
                str(entity_ref): entity
                for entity_ref, entity in sorted(self.entities.items())
            },
            'links': [link.to_json() for link in sorted(self.links)]
        }

    @classmethod
    def from_json(cls, fqid: BUNDLE_FQID, json: JSON) -> 'AnvilBundle':
        """
        Build a bundle from the JSON layout produced by `to_json`.

        :param fqid: the fully qualified ID to assign to the new bundle

        :param json: a JSON object with an `entities` object, mapping
                     stringified entity references to entities, and a
                     `links` array of objects with `inputs`, `activity` and
                     `outputs` properties
        """

        def parse_refs(refs):
            # Links are collected in a set below and therefore need to be
            # hashable. A link holding a mutable `set` is not, so the
            # references are collected in a `frozenset` instead.
            return frozenset(map(EntityReference.parse, refs))

        return cls(
            fqid=fqid,
            entities={
                EntityReference.parse(entity_ref): entity
                for entity_ref, entity in json['entities'].items()
            },
            links={
                Link(inputs=parse_refs(link['inputs']),
                     activity=None if link['activity'] is None else EntityReference.parse(link['activity']),
                     outputs=parse_refs(link['outputs']))
                for link in json['links']
            }
        )
50 changes: 50 additions & 0 deletions src/azul/plugins/hca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,50 @@
from abc import (
ABC,
)

import attr

from azul.indexer import (
BUNDLE_FQID,
Bundle,
)
from azul.types import (
JSON,
MutableJSON,
MutableJSONs,
)


@attr.s(kw_only=True, auto_attribs=True)
class HCABundle(Bundle[BUNDLE_FQID], ABC):
    """
    A bundle that consists of a manifest and a set of metadata files.
    """
    manifest: MutableJSONs
    """
    Each item of the `manifest` attribute's value has this shape:
    {
        'content-type': 'application/json; dcp-type="metadata/biomaterial"',
        'crc32c': 'fd239631',
        'indexed': True,
        'name': 'cell_suspension_0.json',
        's3_etag': 'aa31c093cc816edb1f3a42e577872ec6',
        'sha1': 'f413a9a7923dee616309e4f40752859195798a5d',
        'sha256': 'ea4c9ed9e53a3aa2ca4b7dffcacb6bbe9108a460e8e15d2b3d5e8e5261fb043e',
        'size': 1366,
        'uuid': '0136ebb4-1317-42a0-8826-502fae25c29f',
        'version': '2019-05-16T162155.020000Z'
    }
    """
    metadata_files: MutableJSON

    def to_json(self) -> MutableJSON:
        """
        Return this bundle as a JSON object holding the manifest under
        `manifest` and the metadata files under `metadata`.
        """
        return dict(manifest=self.manifest, metadata=self.metadata_files)

    @classmethod
    def from_json(cls, fqid: BUNDLE_FQID, json: JSON) -> 'HCABundle':
        """
        Build a bundle with the given FQID from a JSON object of the layout
        produced by `to_json`.
        """
        manifest = json['manifest']
        metadata_files = json['metadata']
        return cls(fqid=fqid, manifest=manifest, metadata_files=metadata_files)
10 changes: 5 additions & 5 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
Type,
)

from azul.indexer import (
Bundle,
)
from azul.plugins import (
DocumentSlice,
ManifestConfig,
MetadataPlugin,
Sorting,
)
from azul.plugins.anvil import (
AnvilBundle,
)
from azul.plugins.metadata.anvil.indexer.transform import (
ActivityTransformer,
BaseTransformer,
Expand Down Expand Up @@ -42,7 +42,7 @@
)


class Plugin(MetadataPlugin[Bundle]):
class Plugin(MetadataPlugin[AnvilBundle]):

@classmethod
def atlas(cls) -> str:
Expand Down Expand Up @@ -71,7 +71,7 @@ def transformer_types(self) -> Iterable[Type[BaseTransformer]]:
FileTransformer,
)

def transformers(self, bundle: Bundle, *, delete: bool) -> Iterable[BaseTransformer]:
def transformers(self, bundle: AnvilBundle, *, delete: bool) -> Iterable[BaseTransformer]:
return [
transformer_cls(bundle=bundle, deleted=delete)
for transformer_cls in self.transformer_types()
Expand Down
Loading

0 comments on commit e2cd04f

Please sign in to comment.