diff --git a/.github/workflows/docker.yaml b/.github/workflows/docker.yaml index 2a8d2d0..87e9861 100644 --- a/.github/workflows/docker.yaml +++ b/.github/workflows/docker.yaml @@ -15,7 +15,7 @@ on: permissions: {} env: - VERSION: 0.1.6 + VERSION: 0.1.7 IMAGE_NAME: cpg-flow-seqr-loader DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 5956b3f..78a8022 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -17,7 +17,7 @@ repos: - repo: https://github.com/astral-sh/ruff-pre-commit # Ruff version. - rev: v0.11.11 + rev: v0.12.8 hooks: - id: ruff - id: ruff-format diff --git a/README.md b/README.md index 91ecc94..604739d 100644 --- a/README.md +++ b/README.md @@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \ + --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ @@ -70,7 +70,7 @@ analysis-runner \ ```bash analysis-runner \ --skip-repo-checkout \ - --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \ + --image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \ --config src/cpg_seqr_loader/config_template.toml \ --config cohorts.toml \ # containing the inputs_cohorts and sequencing_type --dataset seqr \ diff --git a/pyproject.toml b/pyproject.toml index c9d7a5d..d02af75 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow' readme = "README.md" # currently cpg-flow is pinned to this version requires-python = ">=3.10,<3.11" -version="0.1.6" +version="0.1.7" license={"file" = "LICENSE"} classifiers=[ 'Environment :: Console', @@ -54,19 +54,6 @@ include_package_data = true [options.package_data] 'cpg_seqr_loader'= ['config_template.toml'] -[tool.black] -line-length = 120 -skip-string-normalization = true -exclude = ''' -/( - venv - | \.mypy_cache - | \.venv - | build - | dist -)/ -''' - [tool.mypy] ignore_missing_imports = true @@ -113,7 +100,6 @@ section-order = ["future", "standard-library", "third-party", "hail", "cpg", "fi cpg = ["cpg_seqr_loader", "talos", "cpg-flow", "cpg-utils"] hail = ["hail"] - [tool.ruff.lint.per-file-ignores] # suppress the ARG002 "Unused method argument" warning in the stages.py file ## - we don't need generic cpg-flow arguments for every Stage, but need to fit the required method signature @@ -122,7 +108,7 @@ hail = ["hail"] "src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"] [tool.bumpversion] -current_version = "0.1.6" +current_version = "0.1.7" parse = "(?P\\d+)\\.(?P\\d+)\\.(?P\\d+)" serialize = ["{major}.{minor}.{patch}"] commit = true diff --git a/src/cpg_seqr_loader/config_template.toml b/src/cpg_seqr_loader/config_template.toml index a846e2c..52ce75a 100644 --- a/src/cpg_seqr_loader/config_template.toml +++ b/src/cpg_seqr_loader/config_template.toml @@ -65,6 +65,10 @@ vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110 [references] vep_mount = "gs://cpg-common-main/references/vep/110/mount" +[ourdna] +exome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.3/exome/frequencies.ht" +genome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.4/genome/frequencies.ht" + [elasticsearch] # Configure access to ElasticSearch server port = '9243' diff --git a/src/cpg_seqr_loader/scripts/annotate_cohort.py b/src/cpg_seqr_loader/scripts/annotate_cohort.py index ca04e7d..3d02363 100644 --- a/src/cpg_seqr_loader/scripts/annotate_cohort.py +++ b/src/cpg_seqr_loader/scripts/annotate_cohort.py @@ -111,6 +111,26 @@ def load_vqsr(vcf_path: str, ht_path: Path) -> hl.Table: return vqsr_ht.checkpoint(str(ht_path), overwrite=True) +def annotate_ourdna(mt: hl.MatrixTable) -> hl.MatrixTable: + """Annotate the MatrixTable with OurDNA data (exomes & genomes).""" + if not ( + (ourdna_exome_ht_path := config.config_retrieve(['ourdna', 'exome'], None)) + and (ourdna_genome_ht_path := config.config_retrieve(['ourdna', 'genome'], None)) + ): + loguru.logger.info('One or both of the OurDNA HTs are not configured, skipping annotation') + return mt + + ourdna_exomes = hl.read_table(ourdna_exome_ht_path) + ourdna_genomes = hl.read_table(ourdna_genome_ht_path) + + # the ht.freq block contains a list of Structs, each AC/AF/AN/homozygote_count + # within this list of structs, the first element is the 'adj' adjusted (qc pass) population, which is what we want + return mt.annotate_rows( + ourdna_genomes=ourdna_genomes[mt.row_key].freq[0], + ourdna_exomes=ourdna_exomes[mt.row_key].freq[0], + ) + + def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable: """ All steps relating to the annotation of gnomAD v4(.1) data @@ -144,6 +164,38 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable: ) +def annotate_with_external_sources( + mt: hl.MatrixTable, + vep_ht_path: str, +) -> hl.MatrixTable: + """ + Annotate MatrixTable with external Tables + """ + + ref_ht_path = config.config_retrieve(['references', 'seqr_combined_reference_data']) + clinvar_ht_path = config.config_retrieve(['references', 'seqr_clinvar']) + + loguru.logger.info( + f""" + Annotating Variant Matrix Table with additional datasets: + \tVEP annotations from {vep_ht_path} + \tReference data from {ref_ht_path} + \tClinVar data from {clinvar_ht_path} + """, + ) + + clinvar_ht = hl.read_table(clinvar_ht_path) + ref_ht = hl.read_table(ref_ht_path) + vep_ht = hl.read_table(vep_ht_path) + + # join all annotation sources into the MatrixTable + return mt.annotate_rows( + clinvar_data=clinvar_ht[mt.row_key], + ref_data=ref_ht[mt.row_key], + vep=vep_ht[mt.row_key].vep, + ) + + def annotate_cohort( mt_path: str, out_mt_path: str, @@ -151,19 +203,7 @@ def annotate_cohort( checkpoint_prefix: str, vqsr_vcf_path: str | None = None, ) -> None: - """ - Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations. - - Args: - mt_path (): - out_mt_path (): - vep_ht_path (): - checkpoint_prefix (): - vqsr_vcf_path (): - - Returns: - Nothing, but hopefully writes out a new MT - """ + """Convert VCF to matrix table, annotate for Seqr Loader, add VEP, OurDNA, and VQSR annotations.""" hail_batch.init_batch( worker_memory=config.config_retrieve(['combiner', 'worker_memory']), @@ -174,13 +214,19 @@ def annotate_cohort( mt = hl.read_matrix_table(mt_path) loguru.logger.info(f'Imported MT from {mt_path} as {mt.n_partitions()} partitions') - # Annotate VEP. Do it before splitting multi, because we run VEP on unsplit VCF, - # and hl.split_multi_hts can handle multiallelic VEP field. - vep_ht = hl.read_table(vep_ht_path) - loguru.logger.info(f'Adding VEP annotations into the Matrix Table from {vep_ht_path}') - loguru.logger.info(f'VEP loaded as {vep_ht.n_partitions()} partitions') - - mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep.mt'), overwrite=True) + sequencing_type = config.config_retrieve(['workflow', 'sequencing_type']) + # Map sequencing type to Seqr-style string in global variables + # https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594 + mt = mt.annotate_globals( + sourceFilePath=mt_path, + genomeVersion=hail_batch.genome_build().replace('GRCh', ''), + hail_version=hl.version(), + sampleType={ + 'genome': 'WGS', + 'exome': 'WES', + 'single_cell': 'RNA', + }.get(sequencing_type, ''), + ) if vqsr_vcf_path: loguru.logger.info('Adding VQSR annotations into the Matrix Table') @@ -193,35 +239,33 @@ def annotate_cohort( ) mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True) - ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data')) - clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar')) + # join all annotation sources into the MatrixTable + mt = annotate_with_external_sources(mt, vep_ht_path) + mt = annotate_ourdna(mt) + # # annotate all the gnomAD v4 fields in a separate function + mt = annotate_gnomad4(mt) + + mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_plus_ext_tables.mt'), overwrite=True) + + # update AF attributes from observed callset frequencies mt = hl.variant_qc(mt) mt = mt.annotate_rows( + # annotate the info struct with the variant QC data - Hail's variant_qc includes the REF allele, so skip it info=mt.info.annotate( - AF=mt.variant_qc.AF, + AC=[mt.variant_qc.AC[1]], + AF=[mt.variant_qc.AF[1]], AN=mt.variant_qc.AN, - AC=mt.variant_qc.AC, ), - vep=vep_ht[mt.row_key].vep, + # taking a single value here for downstream compatibility in Seqr + AC=mt.variant_qc.AC[1], + AF=mt.variant_qc.AF[1], + AN=mt.variant_qc.AN, ) mt = mt.drop('variant_qc') - # split the AC/AF attributes into separate entries, overwriting the array in INFO - # these elements become a 1-element array for [ALT] instead [REF, ALT] - mt = mt.annotate_rows( - info=mt.info.annotate( - AF=[mt.info.AF[1]], - AC=[mt.info.AC[1]], - ), - ) - - loguru.logger.info('Annotating with clinvar and munging annotation fields') + loguru.logger.info('Reformatting annotated fields') mt = mt.annotate_rows( - # still taking just a single value here for downstream compatibility in Seqr - AC=mt.info.AC[0], - AF=mt.info.AF[0], - AN=mt.info.AN, aIndex=mt.a_index, wasSplit=mt.was_split, sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep), @@ -235,13 +279,8 @@ def annotate_cohort( xpos=variant_id.get_expr_for_xpos(mt.locus), xstart=variant_id.get_expr_for_xpos(mt.locus), xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1, - clinvar_data=clinvar_ht[mt.row_key], - ref_data=ref_ht[mt.row_key], ) - # annotate all the gnomAD v4 fields in a separate function - mt = annotate_gnomad4(mt) - # this was previously executed in the MtToEs job, as it wasn't possible on QoB loguru.logger.info('Adding GRCh37 coords') liftover_path = config.reference_path('liftover_38_to_37') @@ -285,21 +324,6 @@ def annotate_cohort( gold_stars=mt.clinvar_data.gold_stars, ), ) - mt = mt.annotate_globals( - sourceFilePath=mt_path, - genomeVersion=hail_batch.genome_build().replace('GRCh', ''), - hail_version=hl.version(), - ) - if sequencing_type := config.config_retrieve(['workflow', 'sequencing_type']): - # Map to Seqr-style string - # https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594 - mt = mt.annotate_globals( - sampleType={ - 'genome': 'WGS', - 'exome': 'WES', - 'single_cell': 'RNA', - }.get(sequencing_type, ''), - ) loguru.logger.info('Final Structure:') mt.write(out_mt_path, overwrite=True)