From f7ede6fc4470c142fc4a18bc57f3c93e4891647e Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Mon, 9 Feb 2026 18:00:09 +1100
Subject: [PATCH 01/10] annotate_cohort changes the annotate_cohort step to
 provide an optional extra annotation

---
 .../scripts/annotate_cohort.py                | 48 +++++++++++--------
 1 file changed, 28 insertions(+), 20 deletions(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index 4e0662e..bd81fc0 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -217,27 +217,35 @@ def annotate_cohort(
     )
 
     loguru.logger.info('Annotating with clinvar and munging annotation fields')
-    mt = mt.annotate_rows(
+    # Common annotations for all cases
+    base_annotations = {
         # still taking just a single value here for downstream compatibility in Seqr
-        AC=mt.info.AC[0],
-        AF=mt.info.AF[0],
-        AN=mt.info.AN,
-        aIndex=mt.a_index,
-        wasSplit=mt.was_split,
-        sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
-        variantId=variant_id.get_expr_for_variant_id(mt),
-        contig=variant_id.get_expr_for_contig(mt.locus),
-        pos=mt.locus.position,
-        start=mt.locus.position,
-        end=mt.locus.position + hl.len(mt.alleles[0]) - 1,
-        ref=mt.alleles[0],
-        alt=mt.alleles[1],
-        xpos=variant_id.get_expr_for_xpos(mt.locus),
-        xstart=variant_id.get_expr_for_xpos(mt.locus),
-        xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
-        clinvar_data=clinvar_ht[mt.row_key],
-        ref_data=ref_ht[mt.row_key],
-    )
+        'AC': mt.info.AC[0],
+        'AF': mt.info.AF[0],
+        'AN': mt.info.AN,
+        'aIndex': mt.a_index,
+        'wasSplit': mt.was_split,
+        'sortedTranscriptConsequences': vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
+        'variantId': variant_id.get_expr_for_variant_id(mt),
+        'contig': variant_id.get_expr_for_contig(mt.locus),
+        'pos': mt.locus.position,
+        'start': mt.locus.position,
+        'end': mt.locus.position + hl.len(mt.alleles[0]) - 1,
+        'ref': mt.alleles[0],
+        'alt': mt.alleles[1],
+        'xpos': variant_id.get_expr_for_xpos(mt.locus),
+        'xstart': variant_id.get_expr_for_xpos(mt.locus),
+        'xstop': variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
+        'clinvar_data': clinvar_ht[mt.row_key],
+        'ref_data': ref_ht[mt.row_key],
+    }
+
+    # Add optional avis annotation if available
+    if config.reference_path('seqr_combined_reference_optional'):
+        refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional'))
+        base_annotations['avis'] = refavis_ht[mt.row_key].avis
+
+    mt = mt.annotate_rows(**base_annotations)
 
     # annotate all the gnomAD v4 fields in a separate function
     mt = annotate_gnomad4(mt)

From 1f5178156dd15e04b7a9f9ac0aa5c7f6c75a0009 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Tue, 10 Feb 2026 11:18:26 +1100
Subject: [PATCH 02/10] separating the change from the current code

---
 .../scripts/annotate_cohort.py                | 48 +++++++++----------
 1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index bd81fc0..0a53e3a 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -217,35 +217,31 @@ def annotate_cohort(
     )
 
     loguru.logger.info('Annotating with clinvar and munging annotation fields')
-    # Common annotations for all cases
-    base_annotations = {
+    mt = mt.annotate_rows(
         # still taking just a single value here for downstream compatibility in Seqr
-        'AC': mt.info.AC[0],
-        'AF': mt.info.AF[0],
-        'AN': mt.info.AN,
-        'aIndex': mt.a_index,
-        'wasSplit': mt.was_split,
-        'sortedTranscriptConsequences': vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
-        'variantId': variant_id.get_expr_for_variant_id(mt),
-        'contig': variant_id.get_expr_for_contig(mt.locus),
-        'pos': mt.locus.position,
-        'start': mt.locus.position,
-        'end': mt.locus.position + hl.len(mt.alleles[0]) - 1,
-        'ref': mt.alleles[0],
-        'alt': mt.alleles[1],
-        'xpos': variant_id.get_expr_for_xpos(mt.locus),
-        'xstart': variant_id.get_expr_for_xpos(mt.locus),
-        'xstop': variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
-        'clinvar_data': clinvar_ht[mt.row_key],
-        'ref_data': ref_ht[mt.row_key],
-    }
-
-    # Add optional avis annotation if available
+        AC=mt.info.AC[0],
+        AF=mt.info.AF[0],
+        AN=mt.info.AN,
+        aIndex=mt.a_index,
+        wasSplit=mt.was_split,
+        sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
+        variantId=variant_id.get_expr_for_variant_id(mt),
+        contig=variant_id.get_expr_for_contig(mt.locus),
+        pos=mt.locus.position,
+        start=mt.locus.position,
+        end=mt.locus.position + hl.len(mt.alleles[0]) - 1,
+        ref=mt.alleles[0],
+        alt=mt.alleles[1],
+        xpos=variant_id.get_expr_for_xpos(mt.locus),
+        xstart=variant_id.get_expr_for_xpos(mt.locus),
+        xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
+        clinvar_data=clinvar_ht[mt.row_key],
+        ref_data=ref_ht[mt.row_key],
+    )
     if config.reference_path('seqr_combined_reference_optional'):
         refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional'))
-        base_annotations['avis'] = refavis_ht[mt.row_key].avis
-
-    mt = mt.annotate_rows(**base_annotations)
+        loguru.logger.info('Annotating with refavis data')
+        mt = mt.annotate_rows(avis = (refavis_ht[mt.row_key].avis,))
 
     # annotate all the gnomAD v4 fields in a separate function
     mt = annotate_gnomad4(mt)

From 9d3b02253be664abfcd3f1cb2dba877b7d17e121 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Tue, 10 Feb 2026 11:39:40 +1100
Subject: [PATCH 03/10] linting

---
 src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index 0a53e3a..a6fdfbd 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -241,7 +241,7 @@ def annotate_cohort(
     if config.reference_path('seqr_combined_reference_optional'):
         refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional'))
         loguru.logger.info('Annotating with refavis data')
-        mt = mt.annotate_rows(avis = (refavis_ht[mt.row_key].avis,))
+        mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,))
 
     # annotate all the gnomAD v4 fields in a separate function
     mt = annotate_gnomad4(mt)

From 33eac5b6f1e0a1040f12d3ad285b4295c978aa19 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <117962983+Johnnyassaf@users.noreply.github.com>
Date: Tue, 10 Feb 2026 15:11:48 +1100
Subject: [PATCH 04/10] Adding a walrus operator to preempt an empty field
 exception for an optional annotation in annotate_cohort

Co-authored-by: Matt Welland <mattwellie@gmail.com>
---
 src/cpg_seqr_loader/scripts/annotate_cohort.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index a6fdfbd..b581e04 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -238,8 +238,8 @@ def annotate_cohort(
         clinvar_data=clinvar_ht[mt.row_key],
         ref_data=ref_ht[mt.row_key],
     )
-    if config.reference_path('seqr_combined_reference_optional'):
-        refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional'))
+    if avi_table := config.reference_path('avi_table', None):
+        refavis_ht = hl.read_table(avi_table)
         loguru.logger.info('Annotating with refavis data')
         mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,))
 

From b20d05b273581ab372ba3d4f8eede86138e864da Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Wed, 11 Feb 2026 15:28:54 +1100
Subject: [PATCH 05/10] bugfix

---
 src/cpg_seqr_loader/scripts/annotate_cohort.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index b581e04..c572327 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -238,10 +238,11 @@ def annotate_cohort(
         clinvar_data=clinvar_ht[mt.row_key],
         ref_data=ref_ht[mt.row_key],
     )
-    if avi_table := config.reference_path('avi_table', None):
+    if avi_table := config.config_retrieve(['references','avi_table'], None):
         refavis_ht = hl.read_table(avi_table)
         loguru.logger.info('Annotating with refavis data')
         mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,))
+        mt.describe()
 
     # annotate all the gnomAD v4 fields in a separate function
     mt = annotate_gnomad4(mt)

From 6300d00ad4dae45f8a9ced6156cb8a4e7d74112a Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Wed, 11 Feb 2026 17:52:52 +1100
Subject: [PATCH 06/10] bugfix

---
 src/cpg_seqr_loader/config_template.toml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml
index 9483aae..8cc5fc8 100644
--- a/src/cpg_seqr_loader/config_template.toml
+++ b/src/cpg_seqr_loader/config_template.toml
@@ -27,9 +27,9 @@ force_new_combiner = false
 # highem, standard, or a string, e.g. "4Gi"
 driver_memory = "highmem"
 # string, e.g. "4Gi"
-driver_storage = "10Gi"
+driver_storage = "55Gi"
 # integer
-driver_cores = 2
+driver_cores = 4
 # highem, standard, or a string, e.g. "4Gi"
 worker_memory = "highmem"
 

From 6e439758609d394a13f0119f10e2e4a5b6fa55c5 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Mon, 16 Feb 2026 14:08:47 +1100
Subject: [PATCH 07/10] fixing_reference

---
 src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index c572327..41ea611 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -241,7 +241,7 @@ def annotate_cohort(
     if avi_table := config.config_retrieve(['references','avi_table'], None):
         refavis_ht = hl.read_table(avi_table)
         loguru.logger.info('Annotating with refavis data')
-        mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,))
+        mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].normalised_avis,))
         mt.describe()
 
     # annotate all the gnomAD v4 fields in a separate function

From f5d2caf71efc672c7ee806f27772456db4c17b50 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Mon, 16 Feb 2026 15:47:27 +1100
Subject: [PATCH 08/10] linting

---
 src/cpg_seqr_loader/config_template.toml       | 8 +++-----
 src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +-
 2 files changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml
index 8cc5fc8..b2bc7bb 100644
--- a/src/cpg_seqr_loader/config_template.toml
+++ b/src/cpg_seqr_loader/config_template.toml
@@ -1,7 +1,6 @@
 [workflow]
 
 name = 'seqr_loader'
-
 # used to make sure we don't repeat previously completed stages
 check_expected_outputs = true
 
@@ -27,9 +26,9 @@ force_new_combiner = false
 # highem, standard, or a string, e.g. "4Gi"
 driver_memory = "highmem"
 # string, e.g. "4Gi"
-driver_storage = "55Gi"
+driver_storage = "10Gi"
 # integer
-driver_cores = 4
+driver_cores = 8
 # highem, standard, or a string, e.g. "4Gi"
 worker_memory = "highmem"
 
@@ -49,7 +48,7 @@ worker_memory = "highmem"
 # highem, standard, or a string, e.g. "4Gi"
 driver_memory = "highmem"
 # integer
-driver_cores = 2
+driver_cores = 4
 
 [vcf_from_mt]
 # highem, standard, or a string, e.g. "4Gi"
@@ -84,7 +83,6 @@ liftover_38_to_37 = "gs://cpg-common-main/references/liftover/grch38_to_grch37.o
 seqr_clinvar = "gs://cpg-common-main/references/seqr/v0/clinvar.GRCh38.ht"
 seqr_combined_reference_data = "gs://cpg-common-main/references/seqr/v0/combined_reference_data_grch38.ht"
 vep_mount = "gs://cpg-common-main/references/vep/110/mount"
-
 # these are all related to VQSR
 axiom_poly_vcf = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
 axiom_poly_vcf_index = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py
index 41ea611..2dfd58a 100644
--- a/src/cpg_seqr_loader/scripts/annotate_cohort.py
+++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py
@@ -238,7 +238,7 @@ def annotate_cohort(
         clinvar_data=clinvar_ht[mt.row_key],
         ref_data=ref_ht[mt.row_key],
     )
-    if avi_table := config.config_retrieve(['references','avi_table'], None):
+    if avi_table := config.config_retrieve(['references', 'avi_table'], None):
         refavis_ht = hl.read_table(avi_table)
         loguru.logger.info('Annotating with refavis data')
         mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].normalised_avis,))

From 920f49e98114fe6a4c55ab59e4515b1ad3771027 Mon Sep 17 00:00:00 2001
From: Johnnyassaf <117962983+Johnnyassaf@users.noreply.github.com>
Date: Mon, 16 Feb 2026 16:40:41 +1100
Subject: [PATCH 09/10] Update src/cpg_seqr_loader/config_template.toml

remove unnecessary elements

Co-authored-by: Matt Welland <mattwellie@gmail.com>
---
 src/cpg_seqr_loader/config_template.toml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml
index 5b67974..14c7677 100644
--- a/src/cpg_seqr_loader/config_template.toml
+++ b/src/cpg_seqr_loader/config_template.toml
@@ -1,9 +1,5 @@
 [workflow]
 
-name = 'seqr_loader'
-# used to make sure we don't repeat previously completed stages
-check_expected_outputs = true
-
 # the method to register outputs, can be missing - will not generate metamist analysis entries
 status_reporter = 'metamist'
 

From 7e2c0cadd23c6918be0f94f8cae80e13de594d2c Mon Sep 17 00:00:00 2001
From: Johnnyassaf <johnny.assaf@populationgenomics.org.au>
Date: Mon, 16 Feb 2026 16:57:17 +1100
Subject: [PATCH 10/10] =?UTF-8?q?Bump=20version:=200.1.16=20=E2=86=92=200.?=
 =?UTF-8?q?1.17?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 Dockerfile     | 2 +-
 README.md      | 4 ++--
 pyproject.toml | 4 ++--
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 0b2492c..d9b0710 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,7 +1,7 @@
 FROM australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_hail_gcloud:0.2.137.cpg1-2
 
 ENV PYTHONDONTWRITEBYTECODE=1
-ENV VERSION=0.1.16
+ENV VERSION=0.1.17
 
 WORKDIR /cpg_seqr_loader
 
diff --git a/README.md b/README.md
index 5d71eb4..7d84af6 100644
--- a/README.md
+++ b/README.md
@@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.16 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.17 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \
@@ -70,7 +70,7 @@ analysis-runner \
 ```bash
 analysis-runner \
     --skip-repo-checkout \
-    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.16 \
+    --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.17 \
     --config src/cpg_seqr_loader/config_template.toml \
     --config cohorts.toml \  # containing the inputs_cohorts and sequencing_type
     --dataset seqr \
diff --git a/pyproject.toml b/pyproject.toml
index 5e10e13..94c34a8 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow'
 readme = "README.md"
 # currently cpg-flow is pinned to this version
 requires-python = ">=3.10,<3.12"
-version="0.1.16"
+version="0.1.17"
 license={"file" = "LICENSE"}
 classifiers=[
     'Environment :: Console',
@@ -120,7 +120,7 @@ hail = ["hail"]
 "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"]
 
 [tool.bumpversion]
-current_version = "0.1.16"
+current_version = "0.1.17"
 parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
 serialize = ["{major}.{minor}.{patch}"]
 commit = true