diff --git a/src/cfdb/services/encode.py b/src/cfdb/services/encode.py index d99dc26..5df135e 100644 --- a/src/cfdb/services/encode.py +++ b/src/cfdb/services/encode.py @@ -1,4 +1,102 @@ -"""ENCODE metadata TSV client and C2M2 transformation service.""" +"""ENCODE metadata TSV client and CFDB transformation service. + +Fetches the released-experiment metadata TSV from ENCODE and transforms +each row into a CFDB file document. + +Metadata URL +------------ +https://www.encodeproject.org/metadata/?type=Experiment&status=released + +Field Mapping (ENCODE TSV → CFDB) +---------------------------------- + +File +~~~~ +File accession → local_id +File download URL → access_url, filename (derived) +File format → file_format (EDAM-mapped) +Output type → data_type (EDAM-mapped), output_type +Assay → assay_type (OBI-mapped) +Size → size_in_bytes +md5sum → md5 +File Status → status +Experiment date released → creation_time +File accession → persistent_id (derived URL) +File assembly → genome_assembly +Genome annotation → genome_annotation +File format type → output_type_detail +Biological replicate(s) → biological_replicates +Technical replicate(s) → technical_replicates + +Enriched File +~~~~~~~~~~~~~ +Read length → extra.encode.read_length +Mapped read length → extra.encode.mapped_read_length +Run type → extra.encode.run_type +Paired end → extra.encode.paired_end +Paired with → extra.encode.paired_with +Index of → extra.encode.index_of +Derived from → extra.encode.derived_from +Controlled by → extra.encode.controlled_by +s3_uri → extra.encode.s3_uri +Azure URL → extra.encode.azure_url +File analysis title → extra.encode.file_analysis_title +File analysis status → extra.encode.file_analysis_status +Audit WARNING → extra.encode.audit_warning +Audit NOT_COMPLIANT → extra.encode.audit_not_compliant +Audit ERROR → extra.encode.audit_error + +Collection +~~~~~~~~~~ +Experiment accession → collections[].local_id, name, persistent_id +Lab → collections[].lab +Assay → 
collections[].experiment_type +Experiment target → collections[].experiment_target +Library made from → collections[].analyte_class + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +Project → collections[].extra.encode.project +Platform → collections[].extra.encode.platform +dbxrefs → collections[].extra.encode.dbxrefs +RBNS protein concentration → collections[].extra.encode.rbns_protein_concentration + +Biosample +~~~~~~~~~ +Biosample term name → collections[].biosamples[].local_id +Biosample term id / term name → collections[].biosamples[].anatomy + +Enriched Biosample +~~~~~~~~~~~~~~~~~~ +Biosample type → …biosamples[].extra.encode.biosample_type +Biosample treatments → …biosamples[].extra.encode.biosample_treatments +Biosample treatments amount → …biosamples[].extra.encode.biosample_treatments_amount +Biosample treatments duration → …biosamples[].extra.encode.biosample_treatments_duration +Biosample genetic mods (*) → …biosamples[].extra.encode.biosample_genetic_modifications +Library made from → …biosamples[].extra.encode.library_made_from +Library depleted in → …biosamples[].extra.encode.library_depleted_in +Library extraction method → …biosamples[].extra.encode.library_extraction_method +Library lysis method → …biosamples[].extra.encode.library_lysis_method +Library crosslinking method → …biosamples[].extra.encode.library_crosslinking_method +Library strand specific → …biosamples[].extra.encode.library_strand_specific +Library fragmentation method → …biosamples[].extra.encode.library_fragmentation_method +Library size range → …biosamples[].extra.encode.library_size_range + +(*) Full TSV column: "Biosample genetic modifications methods/categories/ + targets/gene targets/site coordinates/zygosity" + +Subject +~~~~~~~ +Donor(s) → collections[].biosamples[].subjects[].local_id +Biosample organism → collections[].biosamples[].subjects[].taxonomy + +DCC +~~~ +Static / config-derived: dcc.id, dcc.dcc_name, dcc.dcc_abbreviation, + dcc.dcc_description, dcc.contact_email, 
+ dcc.contact_name, dcc.dcc_url, + dcc.project_id_namespace, dcc.project_local_id +""" import logging import re diff --git a/src/cfdb/services/fourdn.py b/src/cfdb/services/fourdn.py index 857353c..e715102 100644 --- a/src/cfdb/services/fourdn.py +++ b/src/cfdb/services/fourdn.py @@ -1,4 +1,81 @@ -"""4DN Search API client for bulk file metadata enrichment.""" +"""4DN Search API client for bulk file metadata enrichment. + +Fetches file, experiment, and biosource metadata from the 4DN Search API +to enrich C2M2-materialized documents. Two enrichment passes run during +sync: collection enrichment (pre-materialization) and file enrichment +(post-materialization). + +API URLs +-------- +File metadata: + https://data.4dnucleome.org/search/?type=FileProcessed + https://data.4dnucleome.org/search/?type=FileFastq +Experiment metadata: + https://data.4dnucleome.org/search/?type=ExperimentHiC + https://data.4dnucleome.org/search/?type=ExperimentSeq + https://data.4dnucleome.org/search/?type=ExperimentDamid + https://data.4dnucleome.org/search/?type=ExperimentChiapet +Biosource tiers: + https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+1 + https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+2 + +Entity Matching +--------------- +File persistent_id contains 4DNF[A-Z0-9]+ accession +Collection persistent_id contains 4DNE[A-Z][A-Z0-9]+ accession + +Field Mapping (4DN API → CFDB) +------------------------------- + +File (post-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +genome_assembly → genome_assembly +file_type → output_type +file_type_detailed → output_type_detail +track_and_facet_info.condition → condition +track_and_facet_info.assay_info → assay_info +track_and_facet_info.replicate_info → biological_replicates (parsed), + technical_replicates (parsed) + +Enriched File +~~~~~~~~~~~~~ +track_and_facet_info.replicate_info → extra.replicate_info +track_and_facet_info.biosource_name → extra.fourdn.biosource_name 
+track_and_facet_info.dataset → extra.fourdn.dataset +extra_files[] → extra.fourdn.extra_files + .href → .href + .md5sum → .md5sum + .file_size → .file_size + .file_format → .file_format +Biosource.cell_line_tier (derived) → extra.fourdn.cell_line_tier + +Collection (pre-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +lab.display_title → collections[].lab +experiment_type.display_title → collections[].experiment_type + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +display_title → collections[].extra.fourdn.display_title +digestion_enzyme.display_title → collections[].extra.fourdn.digestion_enzyme +targeted_factor[].display_title → collections[].extra.fourdn.targeted_factor +crosslinking_method → collections[].extra.fourdn.crosslinking_method +crosslinking_temperature → collections[].extra.fourdn.crosslinking_temperature +crosslinking_time → collections[].extra.fourdn.crosslinking_time +ligation_temperature → collections[].extra.fourdn.ligation_temperature +ligation_volume → collections[].extra.fourdn.ligation_volume +ligation_time → collections[].extra.fourdn.ligation_time +digestion_temperature → collections[].extra.fourdn.digestion_temperature +digestion_time → collections[].extra.fourdn.digestion_time +tagging_method → collections[].extra.fourdn.tagging_method +fragmentation_method → collections[].extra.fourdn.fragmentation_method +biotin_removed → collections[].extra.fourdn.biotin_removed +library_prep_kit → collections[].extra.fourdn.library_prep_kit +average_fragment_size → collections[].extra.fourdn.average_fragment_size +fragment_size_range → collections[].extra.fourdn.fragment_size_range +status → collections[].extra.fourdn.status +date_created → collections[].extra.fourdn.date_created +""" import asyncio import logging diff --git a/src/cfdb/services/hubmap.py b/src/cfdb/services/hubmap.py index 9392db1..0040dda 100644 --- a/src/cfdb/services/hubmap.py +++ b/src/cfdb/services/hubmap.py @@ -1,4 +1,68 @@ -"""HuBMAP Search API integration for access level metadata 
and enrichment.""" +"""HuBMAP Search API integration for access level metadata and enrichment. + +Fetches dataset, donor, and file metadata from the HuBMAP Search API +(Elasticsearch-backed) to enrich C2M2-materialized documents. Three +entity types are enriched during sync: collections and subjects +(pre-materialization) and files (post-materialization). + +API URLs +-------- +Bulk dataset search (search_after pagination): + https://search.api.hubmapconsortium.org/v3/portal/search +Entity lookup (single UUID): + https://search.api.hubmapconsortium.org/v3/entities/{uuid} + +Entity Matching +--------------- +Collection persistent_id matches doi_url +Subject local_id contains donor uuid +File matched via collection doi_url → dataset, then filename + +Field Mapping (HuBMAP Search API → CFDB) +------------------------------------------ + +File (post-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +data_access_level → data_access_level +ingest_metadata.workflow_description → genome_assembly (regex-derived) + +Enriched File +~~~~~~~~~~~~~ +files[].rel_path → extra.hubmap.rel_path +files[].is_data_product → extra.hubmap.is_data_product + +Collection (pre-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +dataset_type → collections[].experiment_type +analyte_class → collections[].analyte_class + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +pipeline → collections[].extra.hubmap.pipeline +processing → collections[].extra.hubmap.processing +group_name → collections[].extra.hubmap.group_name +visualization → collections[].extra.hubmap.visualization +vitessce-hints → collections[].extra.hubmap.vitessce_hints +metadata → collections[].extra.hubmap.metadata + +Enriched Subject (pre-materialization, from donor.mapped_metadata) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +sex → subjects[].extra.hubmap.sex +race → subjects[].extra.hubmap.race +age_value → subjects[].extra.hubmap.age_value +age_unit → subjects[].extra.hubmap.age_unit +height_value → 
subjects[].extra.hubmap.height_value +height_unit → subjects[].extra.hubmap.height_unit +weight_value → subjects[].extra.hubmap.weight_value +weight_unit → subjects[].extra.hubmap.weight_unit +body_mass_index_value → subjects[].extra.hubmap.body_mass_index_value +body_mass_index_unit → subjects[].extra.hubmap.body_mass_index_unit +cause_of_death → subjects[].extra.hubmap.cause_of_death +death_event → subjects[].extra.hubmap.death_event +mechanism_of_injury → subjects[].extra.hubmap.mechanism_of_injury +medical_history → subjects[].extra.hubmap.medical_history +social_history → subjects[].extra.hubmap.social_history +""" import asyncio import logging