Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
permissions: {}

env:
VERSION: 0.1.7
VERSION: 0.1.8
IMAGE_NAME: cpg-flow-seqr-loader
DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev
DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.11.11
rev: v0.12.11
hooks:
- id: ruff
- id: ruff-format
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h
```bash
analysis-runner \
--skip-repo-checkout \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \
--config src/cpg_seqr_loader/config_template.toml \
--config cohorts.toml \ # containing the inputs_cohorts and sequencing_type
--dataset seqr \
Expand All @@ -70,7 +70,7 @@ analysis-runner \
```bash
analysis-runner \
--skip-repo-checkout \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \
--config src/cpg_seqr_loader/config_template.toml \
--config cohorts.toml \ # containing the inputs_cohorts and sequencing_type
--dataset seqr \
Expand Down
4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow'
readme = "README.md"
# cpg-flow currently pins Python to this version range
requires-python = ">=3.10,<3.11"
version="0.1.7"
version="0.1.8"
license={"file" = "LICENSE"}
classifiers=[
'Environment :: Console',
Expand Down Expand Up @@ -122,7 +122,7 @@ hail = ["hail"]
"src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"]

[tool.bumpversion]
current_version = "0.1.7"
current_version = "0.1.8"
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
serialize = ["{major}.{minor}.{patch}"]
commit = true
Expand Down
20 changes: 17 additions & 3 deletions src/cpg_seqr_loader/config_template.toml
Original file line number Diff line number Diff line change
Expand Up @@ -66,11 +66,25 @@ snps_recal_disc_size = 20
snps_gather_disc_size = 10

[images]
bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools_120:1.20"
gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.2.6.1"
vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110.1"
bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools:1.22-1"
gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.6.2.0-1"
vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep:110.1-1"

[references]
liftover_38_to_37 = "gs://cpg-common-main/references/liftover/grch38_to_grch37.over.chain.gz"
hapmap_vcf = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz"
hapmap_vcf_index = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi"
omni_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz"
omni_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi"
one_thousand_genomes_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz"
one_thousand_genomes_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi"
mills_vcf = "gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz"
mills_vcf_index = "gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi"
axiom_poly_vcf = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz"
axiom_poly_vcf_index = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi"
"gnomad_4.1_joint_ht" = "gs://cpg-common-main/references/gnomad/v4.1/joint/ht/gnomad.joint.v4.1.sites.ht"
seqr_clinvar = "gs://cpg-common-main/references/seqr/v0/clinvar.GRCh38.ht"
seqr_combined_reference_data = "gs://cpg-common-main/references/seqr/v0/combined_reference_data_grch38.ht"
vep_mount = "gs://cpg-common-main/references/vep/110/mount"

[elasticsearch]
Expand Down
27 changes: 6 additions & 21 deletions src/cpg_seqr_loader/scripts/annotate_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -122,7 +122,7 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
same MT, with gnomAD 4 annotations placed into the INFO struct as a nested Struct
"""

gnomad4_ht = hl.read_table(config.reference_path('gnomad_4.1_joint_ht'))
gnomad4_ht = hl.read_table(config.config_retrieve(['references', 'gnomad_4.1_joint_ht']))

# the index of the target populations in the joint.freq array
target_index = hl.eval(gnomad4_ht.globals.joint_globals.freq_index_dict[GNOMAD_TARGET_POP])
Expand Down Expand Up @@ -151,19 +151,7 @@ def annotate_cohort(
checkpoint_prefix: str,
vqsr_vcf_path: str | None = None,
) -> None:
"""
Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations.

Args:
mt_path ():
out_mt_path ():
vep_ht_path ():
checkpoint_prefix ():
vqsr_vcf_path ():

Returns:
Nothing, but hopefully writes out a new MT
"""
"""Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations."""

hail_batch.init_batch(
worker_memory=config.config_retrieve(['combiner', 'worker_memory']),
Expand Down Expand Up @@ -193,8 +181,8 @@ def annotate_cohort(
)
mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True)

ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data'))
clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar'))
ref_ht = hl.read_table(config.config_retrieve(['references', 'seqr_combined_reference_data']))
clinvar_ht = hl.read_table(config.config_retrieve(['references', 'seqr_clinvar']))

mt = hl.variant_qc(mt)
mt = mt.annotate_rows(
Expand Down Expand Up @@ -244,7 +232,7 @@ def annotate_cohort(

# this was previously executed in the MtToEs job, as it wasn't possible on QoB
loguru.logger.info('Adding GRCh37 coords')
liftover_path = config.reference_path('liftover_38_to_37')
liftover_path = config.config_retrieve(['references', 'liftover_38_to_37'])
rg37 = hl.get_reference('GRCh37')
rg38 = hl.get_reference('GRCh38')
rg38.add_liftover(liftover_path, rg37)
Expand All @@ -269,10 +257,7 @@ def annotate_cohort(
cadd=mt.ref_data.cadd,
dbnsfp=mt.ref_data.dbnsfp,
geno2mp=mt.ref_data.geno2mp,
gnomad_exomes=mt.ref_data.gnomad_exomes,
gnomad_exome_coverage=mt.ref_data.gnomad_exome_coverage,
gnomad_genomes=mt.ref_data.gnomad_genomes,
gnomad_genome_coverage=mt.ref_data.gnomad_genome_coverage,
# the gnomAD fields were previously copied from the ref_data object; they are intentionally omitted here as part of moving to the gnomAD 4 annotations only (see annotate_gnomad4)
eigen=mt.ref_data.eigen,
exac=mt.ref_data.exac,
g1k=mt.ref_data.g1k,
Expand Down
4 changes: 2 additions & 2 deletions src/cpg_seqr_loader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -203,8 +203,8 @@ def get_localised_resources_for_vqsr() -> dict[str, 'ResourceGroup']:

return {
key: hail_batch.get_batch().read_input_group(
base=config.reference_path(f'broad/{key}_vcf'),
index=config.reference_path(f'broad/{key}_vcf_index'),
base=config.config_retrieve(['references', f'{key}_vcf']),
index=config.config_retrieve(['references', f'{key}_vcf_index']),
)
for key in [
'axiom_poly',
Expand Down