Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ on:
permissions: {}

env:
VERSION: 0.1.6
VERSION: 0.1.7
IMAGE_NAME: cpg-flow-seqr-loader
DOCKER_DEV: australia-southeast1-docker.pkg.dev/cpg-common/images-dev
DOCKER_MAIN: australia-southeast1-docker.pkg.dev/cpg-common/images
Expand Down
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ repos:

- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: v0.11.11
rev: v0.12.8
hooks:
- id: ruff
- id: ruff-format
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ CPG-Flow workflows are operated entirely by defining input Cohorts (see [here](h
```bash
analysis-runner \
--skip-repo-checkout \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
--config src/cpg_seqr_loader/config_template.toml \
--config cohorts.toml \ # containing the inputs_cohorts and sequencing_type
--dataset seqr \
Expand All @@ -70,7 +70,7 @@ analysis-runner \
```bash
analysis-runner \
--skip-repo-checkout \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.6 \
--image australia-southeast1-docker.pkg.dev/cpg-common/images/cpg-flow-seqr-loader:0.1.7 \
--config src/cpg_seqr_loader/config_template.toml \
--config cohorts.toml \ # containing the inputs_cohorts and sequencing_type
--dataset seqr \
Expand Down
18 changes: 2 additions & 16 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description='Seqr-Loader (gVCF-combiner) implemented in CPG-Flow'
readme = "README.md"
# currently cpg-flow is pinned to this version
requires-python = ">=3.10,<3.11"
version="0.1.6"
version="0.1.7"
license={"file" = "LICENSE"}
classifiers=[
'Environment :: Console',
Expand Down Expand Up @@ -54,19 +54,6 @@ include_package_data = true
[options.package_data]
'cpg_seqr_loader'= ['config_template.toml']

[tool.black]
line-length = 120
skip-string-normalization = true
exclude = '''
/(
venv
| \.mypy_cache
| \.venv
| build
| dist
)/
'''

[tool.mypy]
ignore_missing_imports = true

Expand Down Expand Up @@ -113,7 +100,6 @@ section-order = ["future", "standard-library", "third-party", "hail", "cpg", "fi
cpg = ["cpg_seqr_loader", "talos", "cpg-flow", "cpg-utils"]
hail = ["hail"]


[tool.ruff.lint.per-file-ignores]
# suppress the ARG002 "Unused method argument" warning in the stages.py file
## - we don't need generic cpg-flow arguments for every Stage, but need to fit the required method signature
Expand All @@ -122,7 +108,7 @@ hail = ["hail"]
"src/cpg_seqr_loader/scripts/annotate_cohort.py" = ["E501"]

[tool.bumpversion]
current_version = "0.1.6"
current_version = "0.1.7"
parse = "(?P<major>\\d+)\\.(?P<minor>\\d+)\\.(?P<patch>\\d+)"
serialize = ["{major}.{minor}.{patch}"]
commit = true
Expand Down
4 changes: 4 additions & 0 deletions src/cpg_seqr_loader/config_template.toml
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,10 @@ vep = "australia-southeast1-docker.pkg.dev/cpg-common/images/vep_110:release_110
[references]
vep_mount = "gs://cpg-common-main/references/vep/110/mount"

[ourdna]
exome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.3/exome/frequencies.ht"
genome = "gs://cpg-common-main/TEMP_ourdna_frequencies/1.4/genome/frequencies.ht"

[elasticsearch]
# Configure access to ElasticSearch server
port = '9243'
Expand Down
142 changes: 83 additions & 59 deletions src/cpg_seqr_loader/scripts/annotate_cohort.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,26 @@ def load_vqsr(vcf_path: str, ht_path: Path) -> hl.Table:
return vqsr_ht.checkpoint(str(ht_path), overwrite=True)


def annotate_ourdna(mt: hl.MatrixTable) -> hl.MatrixTable:
"""Annotate the MatrixTable with OurDNA data (exomes & genomes)."""
if not (
(ourdna_exome_ht_path := config.config_retrieve(['ourdna', 'exome'], None))
and (ourdna_genome_ht_path := config.config_retrieve(['ourdna', 'genome'], None))
):
loguru.logger.info('One or both of the OurDNA HTs are not configured, skipping annotation')
return mt

ourdna_exomes = hl.read_table(ourdna_exome_ht_path)
ourdna_genomes = hl.read_table(ourdna_genome_ht_path)

# the ht.freq block contains a list of Structs, each AC/AF/AN/homozygote_count
# within this list of structs, the first element is the 'adj' adjusted (qc pass) population, which is what we want
return mt.annotate_rows(
ourdna_genomes=ourdna_genomes[mt.row_key].freq[0],
ourdna_exomes=ourdna_exomes[mt.row_key].freq[0],
)


def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
"""
All steps relating to the annotation of gnomAD v4(.1) data
Expand Down Expand Up @@ -144,26 +164,46 @@ def annotate_gnomad4(mt: hl.MatrixTable) -> hl.MatrixTable:
)


def annotate_with_external_sources(
mt: hl.MatrixTable,
vep_ht_path: str,
) -> hl.MatrixTable:
"""
Annotate MatrixTable with external Tables
"""

ref_ht_path = config.config_retrieve(['references', 'seqr_combined_reference_data'])
clinvar_ht_path = config.config_retrieve(['references', 'seqr_clinvar'])

loguru.logger.info(
f"""
Annotating Variant Matrix Table with additional datasets:
\tVEP annotations from {vep_ht_path}
\tReference data from {ref_ht_path}
\tClinVar data from {clinvar_ht_path}
""",
)

clinvar_ht = hl.read_table(clinvar_ht_path)
ref_ht = hl.read_table(ref_ht_path)
vep_ht = hl.read_table(vep_ht_path)

# join all annotation sources into the MatrixTable
return mt.annotate_rows(
clinvar_data=clinvar_ht[mt.row_key],
ref_data=ref_ht[mt.row_key],
vep=vep_ht[mt.row_key].vep,
)


def annotate_cohort(
mt_path: str,
out_mt_path: str,
vep_ht_path: str,
checkpoint_prefix: str,
vqsr_vcf_path: str | None = None,
) -> None:
"""
Convert VCF to matrix table, annotate for Seqr Loader, add VEP and VQSR annotations.

Args:
mt_path ():
out_mt_path ():
vep_ht_path ():
checkpoint_prefix ():
vqsr_vcf_path ():

Returns:
Nothing, but hopefully writes out a new MT
"""
"""Convert VCF to matrix table, annotate for Seqr Loader, add VEP, OurDNA, and VQSR annotations."""

hail_batch.init_batch(
worker_memory=config.config_retrieve(['combiner', 'worker_memory']),
Expand All @@ -174,13 +214,19 @@ def annotate_cohort(
mt = hl.read_matrix_table(mt_path)
loguru.logger.info(f'Imported MT from {mt_path} as {mt.n_partitions()} partitions')

# Annotate VEP. Do it before splitting multi, because we run VEP on unsplit VCF,
# and hl.split_multi_hts can handle multiallelic VEP field.
vep_ht = hl.read_table(vep_ht_path)
loguru.logger.info(f'Adding VEP annotations into the Matrix Table from {vep_ht_path}')
loguru.logger.info(f'VEP loaded as {vep_ht.n_partitions()} partitions')

mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep.mt'), overwrite=True)
sequencing_type = config.config_retrieve(['workflow', 'sequencing_type'])
# Map sequencing type to Seqr-style string in global variables
# https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594
mt = mt.annotate_globals(
sourceFilePath=mt_path,
genomeVersion=hail_batch.genome_build().replace('GRCh', ''),
hail_version=hl.version(),
sampleType={
'genome': 'WGS',
'exome': 'WES',
'single_cell': 'RNA',
}.get(sequencing_type, ''),
)

if vqsr_vcf_path:
loguru.logger.info('Adding VQSR annotations into the Matrix Table')
Expand All @@ -193,35 +239,33 @@ def annotate_cohort(
)
mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_vep_vqsr.mt'), overwrite=True)

ref_ht = hl.read_table(config.reference_path('seqr_combined_reference_data'))
clinvar_ht = hl.read_table(config.reference_path('seqr_clinvar'))
# join all annotation sources into the MatrixTable
mt = annotate_with_external_sources(mt, vep_ht_path)
mt = annotate_ourdna(mt)

# # annotate all the gnomAD v4 fields in a separate function
mt = annotate_gnomad4(mt)

mt = mt.checkpoint(output=join(checkpoint_prefix, 'mt_plus_ext_tables.mt'), overwrite=True)

# update AF attributes from observed callset frequencies
mt = hl.variant_qc(mt)
mt = mt.annotate_rows(
# annotate the info struct with the variant QC data - Hail's variant_qc includes the REF allele, so skip it
info=mt.info.annotate(
AF=mt.variant_qc.AF,
AC=[mt.variant_qc.AC[1]],
AF=[mt.variant_qc.AF[1]],
AN=mt.variant_qc.AN,
AC=mt.variant_qc.AC,
),
vep=vep_ht[mt.row_key].vep,
# taking a single value here for downstream compatibility in Seqr
AC=mt.variant_qc.AC[1],
AF=mt.variant_qc.AF[1],
AN=mt.variant_qc.AN,
)
mt = mt.drop('variant_qc')

# split the AC/AF attributes into separate entries, overwriting the array in INFO
# these elements become a 1-element array for [ALT] instead [REF, ALT]
mt = mt.annotate_rows(
info=mt.info.annotate(
AF=[mt.info.AF[1]],
AC=[mt.info.AC[1]],
),
)

loguru.logger.info('Annotating with clinvar and munging annotation fields')
loguru.logger.info('Reformatting annotated fields')
mt = mt.annotate_rows(
# still taking just a single value here for downstream compatibility in Seqr
AC=mt.info.AC[0],
AF=mt.info.AF[0],
AN=mt.info.AN,
aIndex=mt.a_index,
wasSplit=mt.was_split,
sortedTranscriptConsequences=vep.get_expr_for_vep_sorted_transcript_consequences_array(mt.vep),
Expand All @@ -235,13 +279,8 @@ def annotate_cohort(
xpos=variant_id.get_expr_for_xpos(mt.locus),
xstart=variant_id.get_expr_for_xpos(mt.locus),
xstop=variant_id.get_expr_for_xpos(mt.locus) + hl.len(mt.alleles[0]) - 1,
clinvar_data=clinvar_ht[mt.row_key],
ref_data=ref_ht[mt.row_key],
)

# annotate all the gnomAD v4 fields in a separate function
mt = annotate_gnomad4(mt)

# this was previously executed in the MtToEs job, as it wasn't possible on QoB
loguru.logger.info('Adding GRCh37 coords')
liftover_path = config.reference_path('liftover_38_to_37')
Expand Down Expand Up @@ -285,21 +324,6 @@ def annotate_cohort(
gold_stars=mt.clinvar_data.gold_stars,
),
)
mt = mt.annotate_globals(
sourceFilePath=mt_path,
genomeVersion=hail_batch.genome_build().replace('GRCh', ''),
hail_version=hl.version(),
)
if sequencing_type := config.config_retrieve(['workflow', 'sequencing_type']):
# Map to Seqr-style string
# https://github.com/broadinstitute/seqr/blob/e0c179c36c0f68c892017de5eab2e4c1b9ffdc92/seqr/models.py#L592-L594
mt = mt.annotate_globals(
sampleType={
'genome': 'WGS',
'exome': 'WES',
'single_cell': 'RNA',
}.get(sequencing_type, ''),
)

loguru.logger.info('Final Structure:')
mt.write(out_mt_path, overwrite=True)
Expand Down