abdenlab · conradbzura · Feb 20, 2026 · Feb 20, 2026 · Feb 20, 2026 · nvictus
diff --git a/src/cfdb/services/encode.py b/src/cfdb/services/encode.py
@@ -1,4 +1,102 @@
-"""ENCODE metadata TSV client and C2M2 transformation service."""
+"""ENCODE metadata TSV client and CFDB transformation service.
+
+Fetches the released-experiment metadata TSV from ENCODE and transforms
+each row into a CFDB file document.
+
+Metadata URL
+------------
+https://www.encodeproject.org/metadata/?type=Experiment&status=released
+
+Field Mapping (ENCODE TSV → CFDB)
+----------------------------------
+
+File
+~~~~
+File accession                  → local_id
+File download URL               → access_url, filename (derived)
+File format                     → file_format (EDAM-mapped)
+Output type                     → data_type (EDAM-mapped), output_type
+Assay                           → assay_type (OBI-mapped)
+Size                            → size_in_bytes
+md5sum                          → md5
+File Status                     → status
+Experiment date released        → creation_time
+File accession                  → persistent_id (derived URL)
+File assembly                   → genome_assembly
+Genome annotation               → genome_annotation
+File format type                → output_type_detail
+Biological replicate(s)         → biological_replicates
+Technical replicate(s)          → technical_replicates
+
+Enriched File
+~~~~~~~~~~~~~
+Read length                     → extra.encode.read_length
+Mapped read length              → extra.encode.mapped_read_length
+Run type                        → extra.encode.run_type
+Paired end                      → extra.encode.paired_end
+Paired with                     → extra.encode.paired_with
+Index of                        → extra.encode.index_of
+Derived from                    → extra.encode.derived_from
+Controlled by                   → extra.encode.controlled_by
+s3_uri                          → extra.encode.s3_uri
+Azure URL                       → extra.encode.azure_url
+File analysis title             → extra.encode.file_analysis_title
+File analysis status            → extra.encode.file_analysis_status
+Audit WARNING                   → extra.encode.audit_warning
+Audit NOT_COMPLIANT             → extra.encode.audit_not_compliant
+Audit ERROR                     → extra.encode.audit_error
+
+Collection
+~~~~~~~~~~
+Experiment accession            → collections[].local_id, name, persistent_id
+Lab                             → collections[].lab
+Assay                           → collections[].experiment_type
+Experiment target               → collections[].experiment_target
+Library made from               → collections[].analyte_class
+
+Enriched Collection
+~~~~~~~~~~~~~~~~~~~
+Project                         → collections[].extra.encode.project
+Platform                        → collections[].extra.encode.platform
+dbxrefs                         → collections[].extra.encode.dbxrefs
+RBNS protein concentration      → collections[].extra.encode.rbns_protein_concentration
+
+Biosample
+~~~~~~~~~
+Biosample term name             → collections[].biosamples[].local_id
+Biosample term id / term name   → collections[].biosamples[].anatomy
+
+Enriched Biosample
+~~~~~~~~~~~~~~~~~~
+Biosample type                  → …biosamples[].extra.encode.biosample_type
+Biosample treatments            → …biosamples[].extra.encode.biosample_treatments
+Biosample treatments amount     → …biosamples[].extra.encode.biosample_treatments_amount
+Biosample treatments duration   → …biosamples[].extra.encode.biosample_treatments_duration
+Biosample genetic mods (*)      → …biosamples[].extra.encode.biosample_genetic_modifications
+Library made from               → …biosamples[].extra.encode.library_made_from
+Library depleted in             → …biosamples[].extra.encode.library_depleted_in
+Library extraction method       → …biosamples[].extra.encode.library_extraction_method
+Library lysis method            → …biosamples[].extra.encode.library_lysis_method
+Library crosslinking method     → …biosamples[].extra.encode.library_crosslinking_method
+Library strand specific         → …biosamples[].extra.encode.library_strand_specific
+Library fragmentation method    → …biosamples[].extra.encode.library_fragmentation_method
+Library size range              → …biosamples[].extra.encode.library_size_range
+
+(*) Six TSV columns, each named "Biosample genetic modifications <suffix>":
+    methods, categories, targets, gene targets, site coordinates, zygosity
+
+Subject
+~~~~~~~
+Donor(s)                        → collections[].biosamples[].subjects[].local_id
+Biosample organism              → collections[].biosamples[].subjects[].taxonomy
+
+DCC
+~~~
+Static / config-derived:          dcc.id, dcc.dcc_name, dcc.dcc_abbreviation,
+                                  dcc.dcc_description, dcc.contact_email,
+                                  dcc.contact_name, dcc.dcc_url,
+                                  dcc.project_id_namespace, dcc.project_local_id
+"""
 
 import logging
 import re

diff --git a/src/cfdb/services/fourdn.py b/src/cfdb/services/fourdn.py
@@ -1,4 +1,81 @@
-"""4DN Search API client for bulk file metadata enrichment."""
+"""4DN Search API client for bulk file metadata enrichment.
+
+Fetches file, experiment, and biosource metadata from the 4DN Search API
+to enrich C2M2-materialized documents. Two enrichment passes run during
+sync: collection enrichment (pre-materialization) and file enrichment
+(post-materialization).
+
+API URLs
+--------
+File metadata:
+  https://data.4dnucleome.org/search/?type=FileProcessed
+  https://data.4dnucleome.org/search/?type=FileFastq
+Experiment metadata:
+  https://data.4dnucleome.org/search/?type=ExperimentHiC
+  https://data.4dnucleome.org/search/?type=ExperimentSeq
+  https://data.4dnucleome.org/search/?type=ExperimentDamid
+  https://data.4dnucleome.org/search/?type=ExperimentChiapet
+Biosource tiers:
+  https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+1
+  https://data.4dnucleome.org/search/?type=Biosource&cell_line_tier=Tier+2
+
+Entity Matching
+---------------
+File          persistent_id contains 4DNF[A-Z0-9]+ accession
+Collection    persistent_id contains 4DNE[A-Z][A-Z0-9]+ accession
+
+Field Mapping (4DN API → CFDB)
+-------------------------------
+
+File (post-materialization)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+genome_assembly                         → genome_assembly
+file_type                               → output_type
+file_type_detailed                      → output_type_detail
+track_and_facet_info.condition          → condition
+track_and_facet_info.assay_info         → assay_info
+track_and_facet_info.replicate_info     → biological_replicates (parsed),
+                                          technical_replicates (parsed)
+
+Enriched File
+~~~~~~~~~~~~~
+track_and_facet_info.replicate_info     → extra.replicate_info
+track_and_facet_info.biosource_name     → extra.fourdn.biosource_name
+track_and_facet_info.dataset            → extra.fourdn.dataset
+extra_files[]                           → extra.fourdn.extra_files
+  .href                                   .href
+  .md5sum                                 .md5sum
+  .file_size                              .file_size
+  .file_format                            .file_format
+Biosource.cell_line_tier (derived)      → extra.fourdn.cell_line_tier
+
+Collection (pre-materialization)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+lab.display_title                       → collections[].lab
+experiment_type.display_title           → collections[].experiment_type
+
+Enriched Collection
+~~~~~~~~~~~~~~~~~~~
+display_title                           → collections[].extra.fourdn.display_title
+digestion_enzyme.display_title          → collections[].extra.fourdn.digestion_enzyme
+targeted_factor[].display_title         → collections[].extra.fourdn.targeted_factor
+crosslinking_method                     → collections[].extra.fourdn.crosslinking_method
+crosslinking_temperature                → collections[].extra.fourdn.crosslinking_temperature
+crosslinking_time                       → collections[].extra.fourdn.crosslinking_time
+ligation_temperature                    → collections[].extra.fourdn.ligation_temperature
+ligation_volume                         → collections[].extra.fourdn.ligation_volume
+ligation_time                           → collections[].extra.fourdn.ligation_time
+digestion_temperature                   → collections[].extra.fourdn.digestion_temperature
+digestion_time                          → collections[].extra.fourdn.digestion_time
+tagging_method                          → collections[].extra.fourdn.tagging_method
+fragmentation_method                    → collections[].extra.fourdn.fragmentation_method
+biotin_removed                          → collections[].extra.fourdn.biotin_removed
+library_prep_kit                        → collections[].extra.fourdn.library_prep_kit
+average_fragment_size                   → collections[].extra.fourdn.average_fragment_size
+fragment_size_range                     → collections[].extra.fourdn.fragment_size_range
+status                                  → collections[].extra.fourdn.status
+date_created                            → collections[].extra.fourdn.date_created
+"""
 
 import asyncio
 import logging

diff --git a/src/cfdb/services/hubmap.py b/src/cfdb/services/hubmap.py
@@ -1,4 +1,68 @@
-"""HuBMAP Search API integration for access level metadata and enrichment."""
+"""HuBMAP Search API integration for access-level metadata and enrichment.
+
+Fetches dataset, donor, and file metadata from the HuBMAP Search API
+(Elasticsearch-backed) to enrich C2M2-materialized documents. Three
+entity types are enriched during sync: collections and subjects
+(pre-materialization) and files (post-materialization).
+
+API URLs
+--------
+Bulk dataset search (search_after pagination):
+  https://search.api.hubmapconsortium.org/v3/portal/search
+Entity lookup (single UUID):
+  https://search.api.hubmapconsortium.org/v3/entities/{uuid}
+
+Entity Matching
+---------------
+Collection    persistent_id matches doi_url
+Subject       local_id contains donor uuid
+File          matched via collection doi_url → dataset, then filename
+
+Field Mapping (HuBMAP Search API → CFDB)
+------------------------------------------
+
+File (post-materialization)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~
+data_access_level                       → data_access_level
+ingest_metadata.workflow_description    → genome_assembly (regex-derived)
+
+Enriched File
+~~~~~~~~~~~~~
+files[].rel_path                        → extra.hubmap.rel_path
+files[].is_data_product                 → extra.hubmap.is_data_product
+
+Collection (pre-materialization)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+dataset_type                            → collections[].experiment_type
+analyte_class                           → collections[].analyte_class
+
+Enriched Collection
+~~~~~~~~~~~~~~~~~~~
+pipeline                                → collections[].extra.hubmap.pipeline
+processing                              → collections[].extra.hubmap.processing
+group_name                              → collections[].extra.hubmap.group_name
+visualization                           → collections[].extra.hubmap.visualization
+vitessce-hints                          → collections[].extra.hubmap.vitessce_hints
+metadata                                → collections[].extra.hubmap.metadata
+
+Enriched Subject (pre-materialization, from donor.mapped_metadata)
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+sex                                     → subjects[].extra.hubmap.sex
+race                                    → subjects[].extra.hubmap.race
+age_value                               → subjects[].extra.hubmap.age_value
+age_unit                                → subjects[].extra.hubmap.age_unit
+height_value                            → subjects[].extra.hubmap.height_value
+height_unit                             → subjects[].extra.hubmap.height_unit
+weight_value                            → subjects[].extra.hubmap.weight_value
+weight_unit                             → subjects[].extra.hubmap.weight_unit
+body_mass_index_value                   → subjects[].extra.hubmap.body_mass_index_value
+body_mass_index_unit                    → subjects[].extra.hubmap.body_mass_index_unit
+cause_of_death                          → subjects[].extra.hubmap.cause_of_death
+death_event                             → subjects[].extra.hubmap.death_event
+mechanism_of_injury                     → subjects[].extra.hubmap.mechanism_of_injury
+medical_history                         → subjects[].extra.hubmap.medical_history
+social_history                          → subjects[].extra.hubmap.social_history
+"""
 
 import asyncio
 import logging