populationgenomics · MattWellie · Sep 29, 2025 · Sep 29, 2025
diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml
@@ -15,7 +15,7 @@ on:
 permissions: {}
 
 env:
-  VERSION: 0.1.7
+  VERSION: 0.1.8
   IMAGE_NAME: cpg-flow-seqr-loader
   DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev
   DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -17,7 +17,7 @@ repos:
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
     # Ruff version.
-    rev: v0.11.11
+    rev: v0.12.11
     hooks:
       - id: ruff
       - id: ruff-format

diff --git a/README.md b/README.md
@@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \
@@ -70,7 +70,7 @@ analysis-runner \
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \

diff --git a/pyproject.toml b/pyproject.toml
@@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow'
 readme = "README.md"
 # currently cpg-flow is pinned to this version
 requires-python = ">=3.10,<3.11"
-version="0.1.7"
+version="0.1.8"
 license={"file" = "LICENSE"}
 classifiers=[
     'Environment :: Console',
@@ -122,7 +122,7 @@ hail = ["hail"]
 "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"]
 
 [tool.bumpversion]
-current_version = "0.1.7"
+current_version = "0.1.8"
 parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
 serialize = ["{major}.{minor}.{patch}"]
 commit = true

diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml
@@ -66,11 +66,25 @@ snps_recal_disc_size = 20
 snps_gather_disc_size = 10
 
 [images]
-bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools_120:1.20"
-gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.2.6.1"
-vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110.1"
+bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools:1.22-1"
+gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.6.2.0-1"
+vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep:110.1-1"
 
 [references]
+liftover_38_to_37 = "gs://cpg-common-main/references/liftover/grch38_to_grch37.over.chain.gz"
+hapmap_vcf = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
+hapmap_vcf_index = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
+omni_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
+omni_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
+one_thousand_genomes_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
+one_thousand_genomes_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
+mills_vcf = "gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
+mills_vcf_index = "gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
+axiom_poly_vcf = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
+axiom_poly_vcf_index = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
+"gnomad_4.1_joint_ht" = "gs://cpg-common-main/references/gnomad/v4.1/joint/ht/gnomad.joint.v4.1.sites.ht"
+seqr_clinvar = "gs://cpg-common-main/references/seqr/v0/clinvar.GRCh38.ht"
+seqr_combined_reference_data = "gs://cpg-common-main/references/seqr/v0/combined_reference_data_grch38.ht"
 vep_mount = "gs://cpg-common-main/references/vep/110/mount"
 
 [elasticsearch]

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -122,7 +122,7 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
         same MT, with gnomAD 4 annotations placed into the INFO struct as a nested Struct
     """
 
-    gnomad4_ht = hl.read_table(config.reference_path('gnomad_4.1_joint_ht'))
+    gnomad4_ht = hl.read_table(config.config_retrieve(['references', 'gnomad_4.1_joint_ht']))
 
     # the index of the target populations in the joint.freq array
     target_index = hl.eval(gnomad4_ht.globals.joint_globals.freq_index_dict[GNOMAD_TARGET_POP])
@@ -151,19 +151,7 @@ def annotate_cohort(
     checkpoint_prefix: str,
     vqsr_vcf_path: str | None = None,
 ) -> None:
-    """
-    Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations.
-
-    Args:
-        mt_path ():
-        out_mt_path ():
-        vep_ht_path ():
-        checkpoint_prefix ():
-        vqsr_vcf_path ():
-
-    Returns:
-        Nothing, but hopefully writes out a new MT
-    """
+    """Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations."""
 
     hail_batch.init_batch(
         worker_memory=config.config_retrieve(['combiner', 'worker_memory']),
@@ -193,8 +181,8 @@ def annotate_cohort(
         )
         mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True)
 
-    ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data'))
-    clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar'))
+    ref_ht = hl.read_table(config.config_retrieve(['references', 'seqr_combined_reference_data']))
+    clinvar_ht = hl.read_table(config.config_retrieve(['references', 'seqr_clinvar']))
 
     mt = hl.variant_qc(mt)
     mt = mt.annotate_rows(
@@ -244,7 +232,7 @@ def annotate_cohort(
 
     # this was previously executed in the MtToEs job, as it wasn't possible on QoB
     loguru.logger.info('Adding GRCh37 coords')
-    liftover_path = config.reference_path('liftover_38_to_37')
+    liftover_path = config.config_retrieve(['references', 'liftover_38_to_37'])
     rg37 = hl.get_reference('GRCh37')
     rg38 = hl.get_reference('GRCh38')
     rg38.add_liftover(liftover_path, rg37)
@@ -269,10 +257,7 @@ def annotate_cohort(
         cadd=mt.ref_data.cadd,
         dbnsfp=mt.ref_data.dbnsfp,
         geno2mp=mt.ref_data.geno2mp,
-        gnomad_exomes=mt.ref_data.gnomad_exomes,
-        gnomad_exome_coverage=mt.ref_data.gnomad_exome_coverage,
-        gnomad_genomes=mt.ref_data.gnomad_genomes,
-        gnomad_genome_coverage=mt.ref_data.gnomad_genome_coverage,
+        # gnomAD fields were previously copied from the ref_data object; they are dropped here as we move to gnomAD 4 annotations only (see annotate_gnomad4)
         eigen=mt.ref_data.eigen,
         exac=mt.ref_data.exac,
         g1k=mt.ref_data.g1k,

diff --git a/src/cpg_seqr_loader/utils.py b/src/cpg_seqr_loader/utils.py
@@ -203,8 +203,8 @@ def get_localised_resources_for_vqsr() -> dict[str, 'ResourceGroup']:
 
     return {
         key: hail_batch.get_batch().read_input_group(
-            base=config.reference_path(f'broad/{key}_vcf'),
-            index=config.reference_path(f'broad/{key}_vcf_index'),
+            base=config.config_retrieve(['references', f'{key}_vcf']),
+            index=config.config_retrieve(['references', f'{key}_vcf_index']),
         )
         for key in [
             'axiom_poly',