From f7ede6fc4470c142fc4a18bc57f3c93e4891647e Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 9 Feb 2026 18:00:09 +1100 Subject: [PATCH 01/10] annotate_cohort changes the annotate_cohort step to provide an optional extra annotation --- .../scripts/annotate_cohort.py | 48 +++++++++++-------- 1 file changed, 28 insertions(+), 20 deletions(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index 4e0662e..bd81fc0 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -217,27 +217,35 @@ def annotate_cohort( ) loguru.logger.info('Annotating with clinvar and munging annotation fields') - mt = mt.annotate_rows( + # Common annotations for all cases + base_annotations = { # still taking just a single value here for downstream compatibility in Seqr - AC=mt.info.AC[0], - AF=mt.info.AF[0], - AN=mt.info.AN, - aIndex=mt.a_index, - wasSplit=mt.was_split, - sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep), - variantId=variant_id.get_expr_for_variant_id(mt), - contig=variant_id.get_expr_for_contig(mt.locus), - pos=mt.locus.position, - start=mt.locus.position, - end=mt.locus.position + hl.len(mt.alleles[0]) - 1, - ref=mt.alleles[0], - alt=mt.alleles[1], - xpos=variant_id.get_expr_for_xpos(mt.locus), - xstart=variant_id.get_expr_for_xpos(mt.locus), - xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1, - clinvar_data=clinvar_ht[mt.row_key], - ref_data=ref_ht[mt.row_key], - ) + 'AC': mt.info.AC[0], + 'AF': mt.info.AF[0], + 'AN': mt.info.AN, + 'aIndex': mt.a_index, + 'wasSplit': mt.was_split, + 'sortedTranscriptConsequences': vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep), + 'variantId': variant_id.get_expr_for_variant_id(mt), + 'contig': variant_id.get_expr_for_contig(mt.locus), + 'pos': mt.locus.position, + 'start': mt.locus.position, + 'end': mt.locus.position + 
hl.len(mt.alleles[0]) - 1, + 'ref': mt.alleles[0], + 'alt': mt.alleles[1], + 'xpos': variant_id.get_expr_for_xpos(mt.locus), + 'xstart': variant_id.get_expr_for_xpos(mt.locus), + 'xstop': variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1, + 'clinvar_data': clinvar_ht[mt.row_key], + 'ref_data': ref_ht[mt.row_key], + } + + # Add optional avis annotation if available + if config.reference_path('seqr_combined_reference_optional'): + refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional')) + base_annotations['avis'] = refavis_ht[mt.row_key].avis + + mt = mt.annotate_rows(**base_annotations) # annotate all the gnomAD v4 fields in a separate function mt = annotate_gnomad4(mt) From 1f5178156dd15e04b7a9f9ac0aa5c7f6c75a0009 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 11:18:26 +1100 Subject: [PATCH 02/10] separating the change from the current code --- .../scripts/annotate_cohort.py | 48 +++++++++---------- 1 file changed, 22 insertions(+), 26 deletions(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index bd81fc0..0a53e3a 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -217,35 +217,31 @@ def annotate_cohort( ) loguru.logger.info('Annotating with clinvar and munging annotation fields') - # Common annotations for all cases - base_annotations = { + mt = mt.annotate_rows( # still taking just a single value here for downstream compatibility in Seqr - 'AC': mt.info.AC[0], - 'AF': mt.info.AF[0], - 'AN': mt.info.AN, - 'aIndex': mt.a_index, - 'wasSplit': mt.was_split, - 'sortedTranscriptConsequences': vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep), - 'variantId': variant_id.get_expr_for_variant_id(mt), - 'contig': variant_id.get_expr_for_contig(mt.locus), - 'pos': mt.locus.position, - 'start': mt.locus.position, - 'end': mt.locus.position + hl.len(mt.alleles[0]) - 1, -
'ref': mt.alleles[0], - 'alt': mt.alleles[1], - 'xpos': variant_id.get_expr_for_xpos(mt.locus), - 'xstart': variant_id.get_expr_for_xpos(mt.locus), - 'xstop': variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1, - 'clinvar_data': clinvar_ht[mt.row_key], - 'ref_data': ref_ht[mt.row_key], - } - - # Add optional avis annotation if available + AC=mt.info.AC[0], + AF=mt.info.AF[0], + AN=mt.info.AN, + aIndex=mt.a_index, + wasSplit=mt.was_split, + sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep), + variantId=variant_id.get_expr_for_variant_id(mt), + contig=variant_id.get_expr_for_contig(mt.locus), + pos=mt.locus.position, + start=mt.locus.position, + end=mt.locus.position + hl.len(mt.alleles[0]) - 1, + ref=mt.alleles[0], + alt=mt.alleles[1], + xpos=variant_id.get_expr_for_xpos(mt.locus), + xstart=variant_id.get_expr_for_xpos(mt.locus), + xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1, + clinvar_data=clinvar_ht[mt.row_key], + ref_data=ref_ht[mt.row_key], + ) if config.reference_path('seqr_combined_reference_optional'): refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional')) - base_annotations['avis'] = refavis_ht[mt.row_key].avis - - mt = mt.annotate_rows(**base_annotations) + loguru.logger.info('Annotating with refavis data') + mt = mt.annotate_rows(avis = (refavis_ht[mt.row_key].avis,)) # annotate all the gnomAD v4 fields in a separate function mt = annotate_gnomad4(mt) From 9d3b02253be664abfcd3f1cb2dba877b7d17e121 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Tue, 10 Feb 2026 11:39:40 +1100 Subject: [PATCH 03/10] linting --- src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index 0a53e3a..a6fdfbd 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ 
b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -241,7 +241,7 @@ def annotate_cohort( if config.reference_path('seqr_combined_reference_optional'): refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional')) loguru.logger.info('Annotating with refavis data') - mt = mt.annotate_rows(avis = (refavis_ht[mt.row_key].avis,)) + mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,)) # annotate all the gnomAD v4 fields in a separate function mt = annotate_gnomad4(mt) From 33eac5b6f1e0a1040f12d3ad285b4295c978aa19 Mon Sep 17 00:00:00 2001 From: Johnnyassaf <117962983+Johnnyassaf@users.noreply.github.com> Date: Tue, 10 Feb 2026 15:11:48 +1100 Subject: [PATCH 04/10] Adding a walrus operator to preempt an empty field exception for an optional annotation in annotate_cohort Co-authored-by: Matt Welland --- src/cpg_seqr_loader/scripts/annotate_cohort.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index a6fdfbd..b581e04 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -238,8 +238,8 @@ def annotate_cohort( clinvar_data=clinvar_ht[mt.row_key], ref_data=ref_ht[mt.row_key], ) - if config.reference_path('seqr_combined_reference_optional'): - refavis_ht = hl.read_table(config.reference_path('seqr_combined_reference_optional')) + if avi_table := config.reference_path('avi_table', None): + refavis_ht = hl.read_table(avi_table) loguru.logger.info('Annotating with refavis data') mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,)) From b20d05b273581ab372ba3d4f8eede86138e864da Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 15:28:54 +1100 Subject: [PATCH 05/10] bugfix --- src/cpg_seqr_loader/scripts/annotate_cohort.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py
b/src/cpg_seqr_loader/scripts/annotate_cohort.py index b581e04..c572327 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -238,10 +238,11 @@ def annotate_cohort( clinvar_data=clinvar_ht[mt.row_key], ref_data=ref_ht[mt.row_key], ) - if avi_table := config.reference_path('avi_table', None): + if avi_table := config.config_retrieve(['references','avi_table'], None): refavis_ht = hl.read_table(avi_table) loguru.logger.info('Annotating with refavis data') mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,)) + mt.describe() # annotate all the gnomAD v4 fields in a separate function mt = annotate_gnomad4(mt) From 6300d00ad4dae45f8a9ced6156cb8a4e7d74112a Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Wed, 11 Feb 2026 17:52:52 +1100 Subject: [PATCH 06/10] bugfix --- src/cpg_seqr_loader/config_template.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml index 9483aae..8cc5fc8 100644 --- a/src/cpg_seqr_loader/config_template.toml +++ b/src/cpg_seqr_loader/config_template.toml @@ -27,9 +27,9 @@ force_new_combiner = false # highem, standard, or a string, e.g. "4Gi" driver_memory = "highmem" # string, e.g. "4Gi" -driver_storage = "10Gi" +driver_storage = "55Gi" # integer -driver_cores = 2 +driver_cores = 4 # highem, standard, or a string, e.g. 
"4Gi" worker_memory = "highmem" From 6e439758609d394a13f0119f10e2e4a5b6fa55c5 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 16 Feb 2026 14:08:47 +1100 Subject: [PATCH 07/10] fixing_reference --- src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index c572327..41ea611 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -241,7 +241,7 @@ def annotate_cohort( if avi_table := config.config_retrieve(['references','avi_table'], None): refavis_ht = hl.read_table(avi_table) loguru.logger.info('Annotating with refavis data') - mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].avis,)) + mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].normalised_avis,)) mt.describe() # annotate all the gnomAD v4 fields in a separate function From f5d2caf71efc672c7ee806f27772456db4c17b50 Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 16 Feb 2026 15:47:27 +1100 Subject: [PATCH 08/10] linting --- src/cpg_seqr_loader/config_template.toml | 8 +++----- src/cpg_seqr_loader/scripts/annotate_cohort.py | 2 +- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml index 8cc5fc8..b2bc7bb 100644 --- a/src/cpg_seqr_loader/config_template.toml +++ b/src/cpg_seqr_loader/config_template.toml @@ -1,7 +1,6 @@ [workflow] name = 'seqr_loader' - # used to make sure we don't repeat previously completed stages check_expected_outputs = true @@ -27,9 +26,9 @@ force_new_combiner = false # highem, standard, or a string, e.g. "4Gi" driver_memory = "highmem" # string, e.g. "4Gi" -driver_storage = "55Gi" +driver_storage = "10Gi" # integer -driver_cores = 4 +driver_cores = 8 # highem, standard, or a string, e.g. 
"4Gi" worker_memory = "highmem" @@ -49,7 +48,7 @@ worker_memory = "highmem" # highem, standard, or a string, e.g. "4Gi" driver_memory = "highmem" # integer -driver_cores = 2 +driver_cores = 4 [vcf_from_mt] # highem, standard, or a string, e.g. "4Gi" @@ -84,7 +83,6 @@ liftover_38_to_37 = "gs://cpg-common-main/references/liftover/grch38_to_grch37.o seqr_clinvar = "gs://cpg-common-main/references/seqr/v0/clinvar.GRCh38.ht" seqr_combined_reference_data = "gs://cpg-common-main/references/seqr/v0/combined_reference_data_grch38.ht" vep_mount = "gs://cpg-common-main/references/vep/110/mount" - # these are all related to VQSR axiom_poly_vcf = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" axiom_poly_vcf_index = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index 41ea611..2dfd58a 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -238,7 +238,7 @@ def annotate_cohort( clinvar_data=clinvar_ht[mt.row_key], ref_data=ref_ht[mt.row_key], ) - if avi_table := config.config_retrieve(['references','avi_table'], None): + if avi_table := config.config_retrieve(['references', 'avi_table'], None): refavis_ht = hl.read_table(avi_table) loguru.logger.info('Annotating with refavis data') mt = mt.annotate_rows(avis=(refavis_ht[mt.row_key].normalised_avis,)) From 920f49e98114fe6a4c55ab59e4515b1ad3771027 Mon Sep 17 00:00:00 2001 From: Johnnyassaf <117962983+Johnnyassaf@users.noreply.github.com> Date: Mon, 16 Feb 2026 16:40:41 +1100 Subject: [PATCH 09/10] Update src/cpg_seqr_loader/config_template.toml remove unnecessary elements Co-authored-by: Matt Welland --- src/cpg_seqr_loader/config_template.toml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/cpg_seqr_loader/config_template.toml 
b/src/cpg_seqr_loader/config_template.toml index 5b67974..14c7677 100644 --- a/src/cpg_seqr_loader/config_template.toml +++ b/src/cpg_seqr_loader/config_template.toml @@ -1,9 +1,5 @@ [workflow] -name = 'seqr_loader' -# used to make sure we don't repeat previously completed stages -check_expected_outputs = true - # the method to register outputs, can be missing - will not generate metamist analysis entries status_reporter = 'metamist' From 7e2c0cadd23c6918be0f94f8cae80e13de594d2c Mon Sep 17 00:00:00 2001 From: Johnnyassaf Date: Mon, 16 Feb 2026 16:57:17 +1100 Subject: [PATCH 10/10] =?UTF-8?q?Bump=20version:=200.1.16=20=E2=86=92=200.?= =?UTF-8?q?1.17?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Dockerfile | 2 +- README.md | 4 ++-- pyproject.toml | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/Dockerfile b/Dockerfile index 0b2492c..d9b0710 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,7 +1,7 @@ FROM australia-southeast1-docker.pkg.dev/cpg-common/images/cpg_hail_gcloud:0.2.137.cpg1-2 ENV PYTHONDONTWRITEBYTECODE=1 -ENV VERSION=0.1.16 +ENV VERSION=0.1.17 WORKDIR /cpg_seqr_loader diff --git a/README.md b/README.md index 5d71eb4..7d84af6 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.16 \ + --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.17 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ @@ -70,7 +70,7 @@ analysis-runner \ ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.16 \ + --image 
australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.17 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ diff --git a/pyproject.toml b/pyproject.toml index 5e10e13..94c34a8 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow' readme = "README.md" # currently cpg-flow is pinned to this version requires-python = ">=3.10,<3.12" -version="0.1.16" +version="0.1.17" license={"file" = "LICENSE"} classifiers=[ 'Environment :: Console', @@ -120,7 +120,7 @@ hail = ["hail"] "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"] [tool.bumpversion] -current_version = "0.1.16" +current_version = "0.1.17" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)" serialize = ["{major}.{minor}.{patch}"] commit = true