Skip to content

Commit

Permalink
Push down HCA specifics to a Bundle subclass (#4940)
Browse files Browse the repository at this point in the history
  • Loading branch information
nadove-ucsc authored and hannes-ucsc committed May 15, 2023
1 parent f297602 commit 11fe668
Show file tree
Hide file tree
Showing 20 changed files with 1,222 additions and 1,183 deletions.
13 changes: 9 additions & 4 deletions scripts/can_bundle.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,9 @@
from azul.plugins import (
RepositoryPlugin,
)
from azul.plugins.anvil import (
AnvilBundle,
)
from azul.types import (
AnyJSON,
AnyMutableJSON,
Expand Down Expand Up @@ -117,10 +120,12 @@ def save_bundle(bundle: Bundle, output_dir: str) -> None:


# Redact, in place, the metadata held by the given bundle using the given key.
#
# NOTE(review): this is a diff hunk rendered without +/- markers or
# indentation. Going by the change summary for this file (9 additions,
# 4 deletions), the first four body lines look like the removed implementation
# and the remaining six like the added one -- confirm against the commit.
def redact_bundle(bundle: Bundle, key: bytes) -> None:
# Presumably the removed version: the entity type is taken from the part of
# the metadata file name before the first underscore.
for name in bundle.metadata_files.keys():
entity_type = name.split('_')[0]
if entity_type in redacted_entity_types:
bundle.metadata_files[name] = redact_json(bundle.metadata_files[name], key)
# Presumably the added version: only AnvilBundle instances are redacted, entity
# by entity, based on the entity type of each entity reference.
if isinstance(bundle, AnvilBundle):
for entity_ref, entity_metadata in bundle.entities.items():
if entity_ref.entity_type in redacted_entity_types:
# Re-assigning an existing key does not change the size of the dictionary, so
# doing this while iterating over its items is safe.
bundle.entities[entity_ref] = redact_json(entity_metadata, key)
else:
# Any bundle that is not an AnvilBundle is rejected with this error.
raise RuntimeError('HCA bundles do not support redaction', type(bundle))


def redact_json(o: AnyJSON, key: bytes) -> AnyMutableJSON:
Expand Down
37 changes: 6 additions & 31 deletions src/azul/indexer/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,6 @@
AnyJSON,
JSON,
MutableJSON,
MutableJSONs,
SupportsLessThan,
get_generic_type_params,
)
Expand Down Expand Up @@ -399,23 +398,6 @@ def to_json(self) -> SourcedBundleFQIDJSON:
@attr.s(auto_attribs=True, kw_only=True)
class Bundle(Generic[BUNDLE_FQID], metaclass=ABCMeta):
fqid: BUNDLE_FQID
manifest: MutableJSONs
"""
Each item of the `manifest` attribute's value has this shape:
{
'content-type': 'application/json; dcp-type="metadata/biomaterial"',
'crc32c': 'fd239631',
'indexed': True,
'name': 'cell_suspension_0.json',
's3_etag': 'aa31c093cc816edb1f3a42e577872ec6',
'sha1': 'f413a9a7923dee616309e4f40752859195798a5d',
'sha256': 'ea4c9ed9e53a3aa2ca4b7dffcacb6bbe9108a460e8e15d2b3d5e8e5261fb043e',
'size': 1366,
'uuid': '0136ebb4-1317-42a0-8826-502fae25c29f',
'version': '2019-05-16T162155.020000Z'
}
"""
metadata_files: MutableJSON

@property
def uuid(self) -> BundleUUID:
Expand Down Expand Up @@ -453,26 +435,19 @@ def reject_joiner(self):
"""
Raise a requirement error if the given string is found in the bundle
"""
self._reject_joiner(self.manifest)
self._reject_joiner(self.metadata_files)
self._reject_joiner(self.to_json())

@classmethod
def extension(cls) -> str:
return ''

def to_json(self) -> MutableJSON:
return {
'manifest': self.manifest,
'metadata': self.metadata_files
}

@classmethod
def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> 'Bundle':
manifest = json_['manifest']
metadata = json_['metadata']
assert isinstance(manifest, list), manifest
assert isinstance(metadata, dict), metadata
return cls(fqid=fqid, manifest=manifest, metadata_files=metadata)
raise NotImplementedError

@abstractmethod
def to_json(self) -> MutableJSON:
raise NotImplementedError


class BundlePartition(UUIDPartition['BundlePartition']):
Expand Down
5 changes: 5 additions & 0 deletions src/azul/plugins/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -484,6 +484,11 @@ def _bundle_fqid_cls(self) -> Type[BUNDLE_FQID]:
bundle_cls, spec_cls, ref_cls, fqid_cls = self._generic_params
return fqid_cls

@property
def _bundle_cls(self) -> Type[BUNDLE]:
    """
    The first of the four items in `self._generic_params`, i.e. the bundle
    class.
    """
    # Unpack all four items so that an unexpected number of generic
    # parameters is still detected here.
    bundle_type, _spec_type, _ref_type, _fqid_type = self._generic_params
    return bundle_type

def resolve_source(self, spec: str) -> SOURCE_REF:
"""
Return an instance of :class:`SourceRef` for the repository source
Expand Down
104 changes: 104 additions & 0 deletions src/azul/plugins/anvil.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
from abc import (
ABC,
)
from typing import (
AbstractSet,
Generic,
Iterable,
Optional,
TypeVar,
Union,
)

import attr
from more_itertools import (
one,
)

from azul.indexer import (
BUNDLE_FQID,
Bundle,
)
from azul.indexer.document import (
EntityReference,
EntityType,
)
from azul.types import (
JSON,
MutableJSON,
)

# AnVIL snapshots do not use UUIDs for primary/foreign keys.
# This type alias helps us distinguish these keys from the document UUIDs,
# which are drawn from the `datarepo_row_id` column.
# Note that entities from different tables may have the same key, so
# `KeyReference` should be used when mixing keys from different entity types.
Key = str


@attr.s(auto_attribs=True, frozen=True, kw_only=True, slots=True)
class KeyReference:
    """
    An immutable pairing of a key with the type of the entity it belongs to.
    Keys from different entity types may coincide, so the entity type is
    needed to tell them apart.
    """
    key: Key
    entity_type: EntityType


# Type variable for the kind of reference a `Link` holds. It is bounded by the
# union of `EntityReference` and `KeyReference`.
ENTITY_REF = TypeVar(name='ENTITY_REF', bound=Union[EntityReference, KeyReference])


@attr.s(auto_attribs=True, frozen=True, kw_only=True, order=False)
class Link(Generic[ENTITY_REF]):
    """
    A connection from a set of input entities to a set of output entities,
    optionally by way of an activity.

    Instances are frozen. Whatever iterable is passed for `inputs` or
    `outputs` is converted to a `frozenset` on construction.
    """
    inputs: AbstractSet[ENTITY_REF] = attr.ib(factory=frozenset, converter=frozenset)
    activity: Optional[ENTITY_REF] = attr.ib(default=None)
    outputs: AbstractSet[ENTITY_REF] = attr.ib(factory=frozenset, converter=frozenset)

    @property
    def all_entities(self) -> AbstractSet[ENTITY_REF]:
        """
        The union of the inputs, the outputs and, if there is one, the
        activity.
        """
        return self.inputs | self.outputs | (set() if self.activity is None else {self.activity})

    @classmethod
    def from_json(cls, link: JSON) -> 'Link':
        """
        Create a link from the JSON produced by :meth:`to_json`. Every
        reference is parsed with `EntityReference.parse`.

        :param link: a JSON object with the keys `inputs`, `activity` and
                     `outputs`; `activity` may be None
        """
        return cls(inputs=set(map(EntityReference.parse, link['inputs'])),
                   activity=None if link['activity'] is None else EntityReference.parse(link['activity']),
                   outputs=set(map(EntityReference.parse, link['outputs'])))

    def to_json(self) -> MutableJSON:
        """
        Return a JSON representation of this link. References are rendered
        with `str`; inputs and outputs are sorted by that string form so the
        result does not depend on set iteration order.
        """
        return {
            'inputs': sorted(map(str, self.inputs)),
            'activity': None if self.activity is None else str(self.activity),
            'outputs': sorted(map(str, self.outputs))
        }

    @classmethod
    def merge(cls, links: Iterable['Link']) -> 'Link':
        """
        Combine the given links into one whose inputs and outputs are the
        unions of the inputs and outputs of the given links.

        All given links must share the same activity (possibly None);
        otherwise `one` raises. At least one link must be given.

        :param links: the links to merge; any iterable, including a one-shot
                      iterator or generator
        """
        # The links are traversed three times below. Without materializing
        # them first, a one-shot iterator would be exhausted by the first
        # traversal and the other two would silently see no links at all.
        links = list(links)
        return cls(inputs=frozenset.union(*[link.inputs for link in links]),
                   activity=one({link.activity for link in links}),
                   outputs=frozenset.union(*[link.outputs for link in links]))

    def __lt__(self, other: 'Link') -> bool:
        """
        Order links by their smallest input. Only `<` is defined since the
        attrs-generated ordering methods are disabled with `order=False`.

        NOTE(review): `min` raises ValueError on an empty set, so comparing
        links without inputs fails -- confirm that such links are never
        sorted.
        """
        return min(self.inputs) < min(other.inputs)


@attr.s(auto_attribs=True, kw_only=True)
class AnvilBundle(Bundle[BUNDLE_FQID], ABC):
    """
    A bundle consisting of entities, keyed by their references, and the links
    between them.
    """
    entities: dict[EntityReference, MutableJSON] = attr.ib(factory=dict)
    links: set[Link[EntityReference]] = attr.ib(factory=set)

    def to_json(self) -> MutableJSON:
        """
        Return a JSON representation of this bundle. Entities are emitted in
        sorted order under the string form of their reference, links are
        emitted in sorted order.
        """
        entities = {}
        for ref, metadata in sorted(self.entities.items()):
            entities[str(ref)] = metadata
        links = [link.to_json() for link in sorted(self.links)]
        return {'entities': entities, 'links': links}

    @classmethod
    def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> 'AnvilBundle':
        """
        Create a bundle with the given FQID from the JSON produced by
        :meth:`to_json`.
        """
        entities = {}
        for ref, metadata in json_['entities'].items():
            entities[EntityReference.parse(ref)] = metadata
        links = {Link.from_json(link) for link in json_['links']}
        return cls(fqid=fqid, entities=entities, links=links)
53 changes: 53 additions & 0 deletions src/azul/plugins/hca.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
from abc import (
ABC,
)
import logging

import attr

from azul.indexer import (
BUNDLE_FQID,
Bundle,
)
from azul.types import (
JSON,
MutableJSON,
MutableJSONs,
)

log = logging.getLogger(__name__)


@attr.s(auto_attribs=True, kw_only=True)
class HCABundle(Bundle[BUNDLE_FQID], ABC):
    """
    A bundle consisting of a manifest and a set of metadata files.
    """

    # Each item of the `manifest` attribute's value has this shape:
    #
    # {
    #     'content-type': 'application/json; dcp-type="metadata/biomaterial"',
    #     'crc32c': 'fd239631',
    #     'indexed': True,
    #     'name': 'cell_suspension_0.json',
    #     's3_etag': 'aa31c093cc816edb1f3a42e577872ec6',
    #     'sha1': 'f413a9a7923dee616309e4f40752859195798a5d',
    #     'sha256': 'ea4c9ed9e53a3aa2ca4b7dffcacb6bbe9108a460e8e15d2b3d5e8e5261fb043e',
    #     'size': 1366,
    #     'uuid': '0136ebb4-1317-42a0-8826-502fae25c29f',
    #     'version': '2019-05-16T162155.020000Z'
    # }
    manifest: MutableJSONs

    metadata_files: MutableJSON

    def to_json(self) -> MutableJSON:
        """
        Return a JSON representation of this bundle, with the manifest under
        `manifest` and the metadata files under `metadata`.
        """
        return dict(manifest=self.manifest, metadata=self.metadata_files)

    @classmethod
    def from_json(cls, fqid: BUNDLE_FQID, json_: JSON) -> 'Bundle':
        """
        Create a bundle with the given FQID from the JSON produced by
        :meth:`to_json`. The manifest must be a list and the metadata a
        dictionary.
        """
        manifest, metadata = json_['manifest'], json_['metadata']
        assert isinstance(manifest, list), manifest
        assert isinstance(metadata, dict), metadata
        return cls(fqid=fqid, manifest=manifest, metadata_files=metadata)
10 changes: 5 additions & 5 deletions src/azul/plugins/metadata/anvil/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,15 +6,15 @@
Type,
)

from azul.indexer import (
Bundle,
)
from azul.plugins import (
DocumentSlice,
ManifestConfig,
MetadataPlugin,
Sorting,
)
from azul.plugins.anvil import (
AnvilBundle,
)
from azul.plugins.metadata.anvil.indexer.transform import (
ActivityTransformer,
BaseTransformer,
Expand Down Expand Up @@ -42,7 +42,7 @@
)


class Plugin(MetadataPlugin[Bundle]):
class Plugin(MetadataPlugin[AnvilBundle]):

@classmethod
def atlas(cls) -> str:
Expand Down Expand Up @@ -71,7 +71,7 @@ def transformer_types(self) -> Iterable[Type[BaseTransformer]]:
FileTransformer,
)

def transformers(self, bundle: Bundle, *, delete: bool) -> Iterable[BaseTransformer]:
def transformers(self, bundle: AnvilBundle, *, delete: bool) -> Iterable[BaseTransformer]:
return [
transformer_cls(bundle=bundle, deleted=delete)
for transformer_cls in self.transformer_types()
Expand Down
Loading

0 comments on commit 11fe668

Please sign in to comment.