diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 87e9861..accc503 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -15,7 +15,7 @@ on: permissions: {} env: - VERSION: 0.1.7 + VERSION: 0.1.8 IMAGE_NAME: cpg-flow-seqr-loader DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5956b3f..951ac1f 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.11.11 + rev: v0.12.11 hooks: - id: ruff - id: ruff-format diff --git a/README.md b/README.md index 604739d..d272abf 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \ + --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ @@ -70,7 +70,7 @@ analysis-runner \ ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \ + --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.8 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ diff --git a/pyproject.toml b/pyproject.toml index faf2fbd..02f2517 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow' readme = "README.md" # currently cpg-flow is pinned to this version 
requires-python = ">=3.10,<3.11" -version="0.1.7" +version="0.1.8" license={"file" = "LICENSE"} classifiers=[ 'Environment :: Console', @@ -122,7 +122,7 @@ hail = ["hail"] "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"] [tool.bumpversion] -current_version = "0.1.7" +current_version = "0.1.8" parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)" serialize = ["{major}.{minor}.{patch}"] commit = true diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml index cb8cc93..119335d 100644 --- a/src/cpg_seqr_loader/config_template.toml +++ b/src/cpg_seqr_loader/config_template.toml @@ -66,11 +66,25 @@ snps_recal_disc_size = 20 snps_gather_disc_size = 10 [images] -bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools_120:1.20" -gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.2.6.1" -vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110.1" +bcftools = "australia-southeast1-docker.pkg.dev/cpg-common/images/bcftools:1.22-1" +gatk = "australia-southeast1-docker.pkg.dev/cpg-common/images/gatk:4.6.2.0-1" +vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep:110.1-1" [references] +liftover_38_to_37 = "gs://cpg-common-main/references/liftover/grch38_to_grch37.over.chain.gz" +hapmap_vcf = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz" +hapmap_vcf_index = "gs://cpg-common-main/references/hg38/v0/hapmap_3.3.hg38.vcf.gz.tbi" +omni_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz" +omni_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_omni2.5.hg38.vcf.gz.tbi" +one_thousand_genomes_vcf = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz" +one_thousand_genomes_vcf_index = "gs://cpg-common-main/references/hg38/v0/1000G_phase1.snps.high_confidence.hg38.vcf.gz.tbi" +mills_vcf = "gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz" +mills_vcf_index = 
"gs://cpg-common-main/references/hg38/v0/Mills_and_1000G_gold_standard.indels.hg38.vcf.gz.tbi" +axiom_poly_vcf = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz" +axiom_poly_vcf_index = "gs://cpg-common-main/references/hg38/v0/Axiom_Exome_Plus.genotypes.all_populations.poly.hg38.vcf.gz.tbi" +"gnomad_4.1_joint_ht" = "gs://cpg-common-main/references/gnomad/v4.1/joint/ht/gnomad.joint.v4.1.sites.ht" +seqr_clinvar = "gs://cpg-common-main/references/seqr/v0/clinvar.GRCh38.ht" +seqr_combined_reference_data = "gs://cpg-common-main/references/seqr/v0/combined_reference_data_grch38.ht" vep_mount = "gs://cpg-common-main/references/vep/110/mount" [elasticsearch] diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index ca04e7d..0457ef2 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -122,7 +122,7 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable: same MT, with gnomAD 4 annotations placed into the INFO struct as a nested Struct """ - gnomad4_ht = hl.read_table(config.reference_path('gnomad_4.1_joint_ht')) + gnomad4_ht = hl.read_table(config.config_retrieve(['references', 'gnomad_4.1_joint_ht'])) # the index of the target populations in the joint.freq array target_index = hl.eval(gnomad4_ht.globals.joint_globals.freq_index_dict[GNOMAD_TARGET_POP]) @@ -151,19 +151,7 @@ def annotate_cohort( checkpoint_prefix: str, vqsr_vcf_path: str | None = None, ) -> None: - """ - Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations. 
- - Args: - mt_path (): - out_mt_path (): - vep_ht_path (): - checkpoint_prefix (): - vqsr_vcf_path (): - - Returns: - Nothing, but hopefully writes out a new MT - """ + """Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations.""" hail_batch.init_batch( worker_memory=config.config_retrieve(['combiner', 'worker_memory']), @@ -193,8 +181,8 @@ def annotate_cohort( ) mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True) - ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data')) - clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar')) + ref_ht = hl.read_table(config.config_retrieve(['references', 'seqr_combined_reference_data'])) + clinvar_ht = hl.read_table(config.config_retrieve(['references', 'seqr_clinvar'])) mt = hl.variant_qc(mt) mt = mt.annotate_rows( @@ -244,7 +232,7 @@ def annotate_cohort( # this was previously executed in the MtToEs job, as it wasn't possible on QoB loguru.logger.info('Adding GRCh37 coords') - liftover_path = config.reference_path('liftover_38_to_37') + liftover_path = config.config_retrieve(['references', 'liftover_38_to_37']) rg37 = hl.get_reference('GRCh37') rg38 = hl.get_reference('GRCh38') rg38.add_liftover(liftover_path, rg37) @@ -269,10 +257,7 @@ def annotate_cohort( cadd=mt.ref_data.cadd, dbnsfp=mt.ref_data.dbnsfp, geno2mp=mt.ref_data.geno2mp, - gnomad_exomes=mt.ref_data.gnomad_exomes, - gnomad_exome_coverage=mt.ref_data.gnomad_exome_coverage, - gnomad_genomes=mt.ref_data.gnomad_genomes, - gnomad_genome_coverage=mt.ref_data.gnomad_genome_coverage, + # we previously took the gnomAD data from the ref_data object, but now we're trying to move to gnomAD only eigen=mt.ref_data.eigen, exac=mt.ref_data.exac, g1k=mt.ref_data.g1k, diff --git a/src/cpg_seqr_loader/utils.py b/src/cpg_seqr_loader/utils.py index 697e982..c0cbc50 100644 --- a/src/cpg_seqr_loader/utils.py +++ b/src/cpg_seqr_loader/utils.py @@ -203,8 +203,8 @@ def 
get_localised_resources_for_vqsr() -> dict[str, 'ResourceGroup']: return { key: hail_batch.get_batch().read_input_group( - base=config.reference_path(f'broad/{key}_vcf'), - index=config.reference_path(f'broad/{key}_vcf_index'), + base=config.config_retrieve(['references', f'{key}_vcf']), + index=config.config_retrieve(['references', f'{key}_vcf_index']), ) for key in [ 'axiom_poly',