populationgenomics · MattWellie · Aug 12, 2025 · Aug 12, 2025 · Aug 12, 2025 · Aug 12, 2025
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -15,7 +15,7 @@ on:
 permissions: {}
 
 env:
-  VERSION: 0.1.6
+  VERSION: 0.1.7
   IMAGE_NAME: cpg-flow-seqr-loader
   DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev
   DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.11.11
+    rev: v0.12.8
     hooks:
       - id: ruff
       - id: ruff-format

diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \
@@ -70,7 +70,7 @@ analysis-runner \
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow'
 readme = "README.md"
 # currently cpg-flow is pinned to this version
 requires-python = ">=3.10,<3.11"
-version="0.1.6"
+version="0.1.7"
 license={"file" = "LICENSE"}
 classifiers=[
     'Environment :: Console',
@@ -54,19 +54,6 @@ include_package_data = true
 [options.package_data]
 'cpg_seqr_loader'=  ['config_template.toml']
 
-[tool.black]
-line-length = 120
-skip-string-normalization = true
-exclude = '''
-/(
-  venv
-  | \.mypy_cache
-  | \.venv
-  | build
-  | dist
-)/
-'''
-
 [tool.mypy]
 ignore_missing_imports = true
 
@@ -113,7 +100,6 @@ section-order = ["future", "standard-library", "third-party", "hail", "cpg", "fi
 cpg = ["cpg_seqr_loader", "talos", "cpg-flow", "cpg-utils"]
 hail = ["hail"]
 
-
 [tool.ruff.lint.per-file-ignores]
 # suppress the ARG002 "Unused method argument" warning in the stages.py file
 ## - we don't need generic cpg-flow arguments for every Stage, but need to fit the required method signature
@@ -122,7 +108,7 @@ hail = ["hail"]
 "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"]
 
 [tool.bumpversion]
-current_version = "0.1.6"
+current_version = "0.1.7"
 parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
 serialize = ["{major}.{minor}.{patch}"]
 commit = true

diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml
@@ -65,6 +65,10 @@ vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110
 [references]
 vep_mount = "gs://cpg-common-main/references/vep/110/mount"
 
+[ourdna]
+exome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.3/exome/frequencies.ht"
+genome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.4/genome/frequencies.ht"
+
 [elasticsearch]
 # Configure access to ElasticSearch server
 port = '9243'

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -111,6 +111,26 @@ def load_vqsr(vcf_path: str, ht_path: Path) -> hl.Table:
     return vqsr_ht.checkpoint(str(ht_path), overwrite=True)
 
 
+def annotate_ourdna(mt: hl.MatrixTable) -> hl.MatrixTable:
+    """Annotate the MatrixTable with OurDNA data (exomes & genomes)."""
+    if not (
+        (ourdna_exome_ht_path := config.config_retrieve(['ourdna', 'exome'], None))
+        and (ourdna_genome_ht_path := config.config_retrieve(['ourdna', 'genome'], None))
+    ):
+        loguru.logger.info('One or both of the OurDNA HTs are not configured, skipping annotation')
+        return mt
+
+    ourdna_exomes = hl.read_table(ourdna_exome_ht_path)
+    ourdna_genomes = hl.read_table(ourdna_genome_ht_path)
+
+    # the ht.freq block contains a list of Structs, each AC/AF/AN/homozygote_count
+    # within this list of structs, the first element is the 'adj' adjusted (qc pass) population, which is what we want
+    return mt.annotate_rows(
+        ourdna_genomes=ourdna_genomes[mt.row_key].freq[0],
+        ourdna_exomes=ourdna_exomes[mt.row_key].freq[0],
+    )
+
+
 def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
     """
     All steps relating to the annotation of gnomAD v4(.1) data
@@ -144,26 +164,46 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
     )
 
 
+def annotate_with_external_sources(
+    mt: hl.MatrixTable,
+    vep_ht_path: str,
+) -> hl.MatrixTable:
+    """
+    Annotate MatrixTable with external Tables
+    """
+
+    ref_ht_path = config.config_retrieve(['references', 'seqr_combined_reference_data'])
+    clinvar_ht_path = config.config_retrieve(['references', 'seqr_clinvar'])
+
+    loguru.logger.info(
+        f"""
+        Annotating Variant Matrix Table with additional datasets:
+        \tVEP annotations from {vep_ht_path}
+        \tReference data from {ref_ht_path}
+        \tClinVar data from {clinvar_ht_path}
+        """,
+    )
+
+    clinvar_ht = hl.read_table(clinvar_ht_path)
+    ref_ht = hl.read_table(ref_ht_path)
+    vep_ht = hl.read_table(vep_ht_path)
+
+    # join all annotation sources into the MatrixTable
+    return mt.annotate_rows(
+        clinvar_data=clinvar_ht[mt.row_key],
+        ref_data=ref_ht[mt.row_key],
+        vep=vep_ht[mt.row_key].vep,
+    )
+
+
 def annotate_cohort(
     mt_path: str,
     out_mt_path: str,
     vep_ht_path: str,
     checkpoint_prefix: str,
     vqsr_vcf_path: str | None = None,
 ) -> None:
-    """
-    Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations.
-
-    Args:
-        mt_path ():
-        out_mt_path ():
-        vep_ht_path ():
-        checkpoint_prefix ():
-        vqsr_vcf_path ():
-
-    Returns:
-        Nothing, but hopefully writes out a new MT
-    """
+    """Convert VCF to matrix table, annotate for Seqr Loader, add VEP, OurDNA, and VQSR annotations."""
 
     hail_batch.init_batch(
         worker_memory=config.config_retrieve(['combiner', 'worker_memory']),
@@ -174,13 +214,19 @@ def annotate_cohort(
     mt = hl.read_matrix_table(mt_path)
     loguru.logger.info(f'Imported MT from {mt_path} as {mt.n_partitions()} partitions')
 
-    # Annotate VEP. Do it before splitting multi, because we run VEP on unsplit VCF,
-    # and hl.split_multi_hts can handle multiallelic VEP field.
-    vep_ht = hl.read_table(vep_ht_path)
-    loguru.logger.info(f'Adding VEP annotations into the Matrix Table from {vep_ht_path}')
-    loguru.logger.info(f'VEP loaded as {vep_ht.n_partitions()} partitions')
-
-    mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep.mt'), overwrite=True)
+    sequencing_type = config.config_retrieve(['workflow', 'sequencing_type'])
+    # Map sequencing type to Seqr-style string in global variables
+    # https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594
+    mt = mt.annotate_globals(
+        sourceFilePath=mt_path,
+        genomeVersion=hail_batch.genome_build().replace('GRCh', ''),
+        hail_version=hl.version(),
+        sampleType={
+            'genome': 'WGS',
+            'exome': 'WES',
+            'single_cell': 'RNA',
+        }.get(sequencing_type, ''),
+    )
 
     if vqsr_vcf_path:
         loguru.logger.info('Adding VQSR annotations into the Matrix Table')
@@ -193,35 +239,33 @@ def annotate_cohort(
         )
         mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True)
 
-    ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data'))
-    clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar'))
+    # join all annotation sources into the MatrixTable
+    mt = annotate_with_external_sources(mt, vep_ht_path)
+    mt = annotate_ourdna(mt)
 
+    # # annotate all the gnomAD v4 fields in a separate function
+    mt = annotate_gnomad4(mt)
+
+    mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_plus_ext_tables.mt'), overwrite=True)
+
+    # update AF attributes from observed callset frequencies
     mt = hl.variant_qc(mt)
     mt = mt.annotate_rows(
+        # annotate the info struct with the variant QC data - Hail's variant_qc includes the REF allele, so skip it
         info=mt.info.annotate(
-            AF=mt.variant_qc.AF,
+            AC=[mt.variant_qc.AC[1]],
+            AF=[mt.variant_qc.AF[1]],
             AN=mt.variant_qc.AN,
-            AC=mt.variant_qc.AC,
         ),
-        vep=vep_ht[mt.row_key].vep,
+        # taking a single value here for downstream compatibility in Seqr
+        AC=mt.variant_qc.AC[1],
+        AF=mt.variant_qc.AF[1],
+        AN=mt.variant_qc.AN,
     )
     mt = mt.drop('variant_qc')
 
-    # split the AC/AF attributes into separate entries, overwriting the array in INFO
-    # these elements become a 1-element array for [ALT] instead [REF, ALT]
-    mt = mt.annotate_rows(
-        info=mt.info.annotate(
-            AF=[mt.info.AF[1]],
-            AC=[mt.info.AC[1]],
-        ),
-    )
-
-    loguru.logger.info('Annotating with clinvar and munging annotation fields')
+    loguru.logger.info('Reformatting annotated fields')
     mt = mt.annotate_rows(
-        # still taking just a single value here for downstream compatibility in Seqr
-        AC=mt.info.AC[0],
-        AF=mt.info.AF[0],
-        AN=mt.info.AN,
         aIndex=mt.a_index,
         wasSplit=mt.was_split,
         sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
@@ -235,13 +279,8 @@ def annotate_cohort(
         xpos=variant_id.get_expr_for_xpos(mt.locus),
         xstart=variant_id.get_expr_for_xpos(mt.locus),
         xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
-        clinvar_data=clinvar_ht[mt.row_key],
-        ref_data=ref_ht[mt.row_key],
     )
 
-    # annotate all the gnomAD v4 fields in a separate function
-    mt = annotate_gnomad4(mt)
-
     # this was previously executed in the MtToEs job, as it wasn't possible on QoB
     loguru.logger.info('Adding GRCh37 coords')
     liftover_path = config.reference_path('liftover_38_to_37')
@@ -285,21 +324,6 @@ def annotate_cohort(
             gold_stars=mt.clinvar_data.gold_stars,
         ),
     )
-    mt = mt.annotate_globals(
-        sourceFilePath=mt_path,
-        genomeVersion=hail_batch.genome_build().replace('GRCh', ''),
-        hail_version=hl.version(),
-    )
-    if sequencing_type := config.config_retrieve(['workflow', 'sequencing_type']):
-        # Map to Seqr-style string
-        # https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594
-        mt = mt.annotate_globals(
-            sampleType={
-                'genome': 'WGS',
-                'exome': 'WES',
-                'single_cell': 'RNA',
-            }.get(sequencing_type, ''),
-        )
 
     loguru.logger.info('Final Structure:')
     mt.write(out_mt_path, overwrite=True)