From f5f5caf0836bb4f7347b0fd80348e6b8224c1c36 Mon Sep 17 00:00:00 2001 From: Conrad Date: Fri, 20 Feb 2026 09:38:33 -0500 Subject: [PATCH 1/3] docs: Add full ENCODE TSV-to-CFDB field mapping to module docstring Document the metadata URL and every TSV column mapped in transform_to_c2m2, grouped by document level (File, Collection, Biosample, Subject, DCC) with enriched subsections for extra fields. --- src/cfdb/services/encode.py | 100 +++++++++++++++++++++++++++++++++++- 1 file changed, 99 insertions(+), 1 deletion(-) diff --git a/src/cfdb/services/encode.py b/src/cfdb/services/encode.py index d99dc26..5df135e 100644 --- a/src/cfdb/services/encode.py +++ b/src/cfdb/services/encode.py @@ -1,4 +1,102 @@ -"""ENCODE metadata TSV client and C2M2 transformation service.""" +"""ENCODE metadata TSV client and CFDB transformation service. + +Fetches the released-experiment metadata TSV from ENCODE and transforms +each row into a CFDB file document. + +Metadata URL +------------ +https://www.encodeproject.org/metadata/?type=Experiment&status=released + +Field Mapping (ENCODE TSV → CFDB) +---------------------------------- + +File +~~~~ +File accession → local_id +File download URL → access_url, filename (derived) +File format → file_format (EDAM-mapped) +Output type → data_type (EDAM-mapped), output_type +Assay → assay_type (OBI-mapped) +Size → size_in_bytes +md5sum → md5 +File Status → status +Experiment date released → creation_time +File accession → persistent_id (derived URL) +File assembly → genome_assembly +Genome annotation → genome_annotation +File format type → output_type_detail +Biological replicate(s) → biological_replicates +Technical replicate(s) → technical_replicates + +Enriched File +~~~~~~~~~~~~~ +Read length → extra.encode.read_length +Mapped read length → extra.encode.mapped_read_length +Run type → extra.encode.run_type +Paired end → extra.encode.paired_end +Paired with → extra.encode.paired_with +Index of → extra.encode.index_of +Derived from → 
extra.encode.derived_from +Controlled by → extra.encode.controlled_by +s3_uri → extra.encode.s3_uri +Azure URL → extra.encode.azure_url +File analysis title → extra.encode.file_analysis_title +File analysis status → extra.encode.file_analysis_status +Audit WARNING → extra.encode.audit_warning +Audit NOT_COMPLIANT → extra.encode.audit_not_compliant +Audit ERROR → extra.encode.audit_error + +Collection +~~~~~~~~~~ +Experiment accession → collections[].local_id, name, persistent_id +Lab → collections[].lab +Assay → collections[].experiment_type +Experiment target → collections[].experiment_target +Library made from → collections[].analyte_class + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +Project → collections[].extra.encode.project +Platform → collections[].extra.encode.platform +dbxrefs → collections[].extra.encode.dbxrefs +RBNS protein concentration → collections[].extra.encode.rbns_protein_concentration + +Biosample +~~~~~~~~~ +Biosample term name → collections[].biosamples[].local_id +Biosample term id / term name → collections[].biosamples[].anatomy + +Enriched Biosample +~~~~~~~~~~~~~~~~~~ +Biosample type → …biosamples[].extra.encode.biosample_type +Biosample treatments → …biosamples[].extra.encode.biosample_treatments +Biosample treatments amount → …biosamples[].extra.encode.biosample_treatments_amount +Biosample treatments duration → …biosamples[].extra.encode.biosample_treatments_duration +Biosample genetic mods (*) → …biosamples[].extra.encode.biosample_genetic_modifications +Library made from → …biosamples[].extra.encode.library_made_from +Library depleted in → …biosamples[].extra.encode.library_depleted_in +Library extraction method → …biosamples[].extra.encode.library_extraction_method +Library lysis method → …biosamples[].extra.encode.library_lysis_method +Library crosslinking method → …biosamples[].extra.encode.library_crosslinking_method +Library strand specific → …biosamples[].extra.encode.library_strand_specific +Library fragmentation method → 
…biosamples[].extra.encode.library_fragmentation_method +Library size range → …biosamples[].extra.encode.library_size_range + +(*) Full TSV column: "Biosample genetic modifications methods/categories/ + targets/gene targets/site coordinates/zygosity" + +Subject +~~~~~~~ +Donor(s) → collections[].biosamples[].subjects[].local_id +Biosample organism → collections[].biosamples[].subjects[].taxonomy + +DCC +~~~ +Static / config-derived: dcc.id, dcc.dcc_name, dcc.dcc_abbreviation, + dcc.dcc_description, dcc.contact_email, + dcc.contact_name, dcc.dcc_url, + dcc.project_id_namespace, dcc.project_local_id +""" import logging import re From ced7e63d391792ee753a0bb050abd63960fe793a Mon Sep 17 00:00:00 2001 From: Conrad Date: Fri, 20 Feb 2026 09:45:43 -0500 Subject: [PATCH 2/3] docs: Add full 4DN API-to-CFDB field mapping to module docstring Document the Search API URLs, entity matching patterns, and every API field mapped in the file and collection enrichment passes, grouped by document level with enriched subsections for DCC-specific fields. --- src/cfdb/services/fourdn.py | 79 ++++++++++++++++++++++++++++++++++++- 1 file changed, 78 insertions(+), 1 deletion(-) diff --git a/src/cfdb/services/fourdn.py b/src/cfdb/services/fourdn.py index 857353c..e715102 100644 --- a/src/cfdb/services/fourdn.py +++ b/src/cfdb/services/fourdn.py @@ -1,4 +1,81 @@ -"""4DN Search API client for bulk file metadata enrichment.""" +"""4DN Search API client for bulk file metadata enrichment. + +Fetches file, experiment, and biosource metadata from the 4DN Search API +to enrich C2M2-materialized documents. Two enrichment passes run during +sync: collection enrichment (pre-materialization) and file enrichment +(post-materialization). 
+ +API URLs +-------- +File metadata: + https://data.4dnucleome.org/search/?type=FileProcessed + https://data.4dnucleome.org/search/?type=FileFastq +Experiment metadata: + https://data.4dnucleome.org/search/?type=ExperimentHiC + https://data.4dnucleome.org/search/?type=ExperimentSeq + https://data.4dnucleome.org/search/?type=ExperimentDamid + https://data.4dnucleome.org/search/?type=ExperimentChiapet +Biosource tiers: + https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+1 + https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+2 + +Entity Matching +--------------- +File persistent_id contains 4DNF[A-Z0-9]+ accession +Collection persistent_id contains 4DNE[A-Z][A-Z0-9]+ accession + +Field Mapping (4DN API → CFDB) +------------------------------- + +File (post-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +genome_assembly → genome_assembly +file_type → output_type +file_type_detailed → output_type_detail +track_and_facet_info.condition → condition +track_and_facet_info.assay_info → assay_info +track_and_facet_info.replicate_info → biological_replicates (parsed), + technical_replicates (parsed) + +Enriched File +~~~~~~~~~~~~~ +track_and_facet_info.replicate_info → extra.replicate_info +track_and_facet_info.biosource_name → extra.fourdn.biosource_name +track_and_facet_info.dataset → extra.fourdn.dataset +extra_files[] → extra.fourdn.extra_files + .href .href + .md5sum .md5sum + .file_size .file_size + .file_format .file_format +Biosource.cell_line_tier (derived) → extra.fourdn.cell_line_tier + +Collection (pre-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +lab.display_title → collections[].lab +experiment_type.display_title → collections[].experiment_type + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +display_title → collections[].extra.fourdn.display_title +digestion_enzyme.display_title → collections[].extra.fourdn.digestion_enzyme +targeted_factor[].display_title → collections[].extra.fourdn.targeted_factor 
+crosslinking_method → collections[].extra.fourdn.crosslinking_method +crosslinking_temperature → collections[].extra.fourdn.crosslinking_temperature +crosslinking_time → collections[].extra.fourdn.crosslinking_time +ligation_temperature → collections[].extra.fourdn.ligation_temperature +ligation_volume → collections[].extra.fourdn.ligation_volume +ligation_time → collections[].extra.fourdn.ligation_time +digestion_temperature → collections[].extra.fourdn.digestion_temperature +digestion_time → collections[].extra.fourdn.digestion_time +tagging_method → collections[].extra.fourdn.tagging_method +fragmentation_method → collections[].extra.fourdn.fragmentation_method +biotin_removed → collections[].extra.fourdn.biotin_removed +library_prep_kit → collections[].extra.fourdn.library_prep_kit +average_fragment_size → collections[].extra.fourdn.average_fragment_size +fragment_size_range → collections[].extra.fourdn.fragment_size_range +status → collections[].extra.fourdn.status +date_created → collections[].extra.fourdn.date_created +""" import asyncio import logging From 100904a2758d91dd6660669f12bfe41b8a9aaf5d Mon Sep 17 00:00:00 2001 From: Conrad Date: Fri, 20 Feb 2026 09:54:47 -0500 Subject: [PATCH 3/3] docs: Add full HuBMAP API-to-CFDB field mapping to module docstring Document the Search API URLs, entity matching strategy, and every API field mapped across the file, collection, and subject enrichment passes, grouped by document level with enriched subsections for DCC-specific fields. --- src/cfdb/services/hubmap.py | 66 ++++++++++++++++++++++++++++++++++++- 1 file changed, 65 insertions(+), 1 deletion(-) diff --git a/src/cfdb/services/hubmap.py b/src/cfdb/services/hubmap.py index 9392db1..0040dda 100644 --- a/src/cfdb/services/hubmap.py +++ b/src/cfdb/services/hubmap.py @@ -1,4 +1,68 @@ -"""HuBMAP Search API integration for access level metadata and enrichment.""" +"""HuBMAP Search API integration for access level metadata and enrichment. 
+ +Fetches dataset, donor, and file metadata from the HuBMAP Search API +(Elasticsearch-backed) to enrich C2M2-materialized documents. Three +enrichment passes run during sync, one per target: collections and subjects +(pre-materialization) and files (post-materialization). + +API URLs +-------- +Bulk dataset search (search_after pagination): + https://search.api.hubmapconsortium.org/v3/portal/search +Entity lookup (single UUID): + https://search.api.hubmapconsortium.org/v3/entities/{uuid} + +Entity Matching +--------------- +Collection persistent_id matches doi_url +Subject local_id contains donor uuid +File matched via collection doi_url → dataset, then filename + +Field Mapping (HuBMAP Search API → CFDB) +------------------------------------------ + +File (post-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~ +data_access_level → data_access_level +ingest_metadata.workflow_description → genome_assembly (regex-derived) + +Enriched File +~~~~~~~~~~~~~ +files[].rel_path → extra.hubmap.rel_path +files[].is_data_product → extra.hubmap.is_data_product + +Collection (pre-materialization) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +dataset_type → collections[].experiment_type +analyte_class → collections[].analyte_class + +Enriched Collection +~~~~~~~~~~~~~~~~~~~ +pipeline → collections[].extra.hubmap.pipeline +processing → collections[].extra.hubmap.processing +group_name → collections[].extra.hubmap.group_name +visualization → collections[].extra.hubmap.visualization +vitessce-hints → collections[].extra.hubmap.vitessce_hints +metadata → collections[].extra.hubmap.metadata + +Enriched Subject (pre-materialization, from donor.mapped_metadata) +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ +sex → subjects[].extra.hubmap.sex +race → subjects[].extra.hubmap.race +age_value → subjects[].extra.hubmap.age_value +age_unit → subjects[].extra.hubmap.age_unit +height_value → subjects[].extra.hubmap.height_value +height_unit → subjects[].extra.hubmap.height_unit +weight_value → 
subjects[].extra.hubmap.weight_value +weight_unit → subjects[].extra.hubmap.weight_unit +body_mass_index_value → subjects[].extra.hubmap.body_mass_index_value +body_mass_index_unit → subjects[].extra.hubmap.body_mass_index_unit +cause_of_death → subjects[].extra.hubmap.cause_of_death +death_event → subjects[].extra.hubmap.death_event +mechanism_of_injury → subjects[].extra.hubmap.mechanism_of_injury +medical_history → subjects[].extra.hubmap.medical_history +social_history → subjects[].extra.hubmap.social_history +""" import asyncio import logging