From 4f2b7faff89aba6317db2717c54072ce03a809d9 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Fri, 9 Jan 2026 10:41:40 +1000 Subject: [PATCH 1/5] download the mitoreport annotation bundle once per month --- src/cpg_flow_mito/jobs/annotations_update.py | 14 ++++++ src/cpg_flow_mito/jobs/mito.py | 4 +- src/cpg_flow_mito/stages.py | 46 ++++++++++++++++++-- 3 files changed, 60 insertions(+), 4 deletions(-) create mode 100644 src/cpg_flow_mito/jobs/annotations_update.py diff --git a/src/cpg_flow_mito/jobs/annotations_update.py b/src/cpg_flow_mito/jobs/annotations_update.py new file mode 100644 index 0000000..82fd4d7 --- /dev/null +++ b/src/cpg_flow_mito/jobs/annotations_update.py @@ -0,0 +1,14 @@ + +from cpg_utils import Path, hail_batch, config + +def download_latest_annotations(output_path: Path, job_attrs: dict[str, str]): + """Trigger the MitoMap download, save to GCP.""" + + batch_instance = hail_batch.get_batch() + job = batch_instance.new_bash_job('Monthly annotation update', job_attrs | {'tool': 'mitoreport'}) + job.image(config.config_retrieve(['images', 'mitoreport'])) + + # noted that this succeeds locally, but may be fragile. Perhaps a retry wrap makes sense. + job.command(f'java -jar mitoreport.jar mito-map-download --output {job.output}') + batch_instance.write_output(job.output, output_path) + return job diff --git a/src/cpg_flow_mito/jobs/mito.py b/src/cpg_flow_mito/jobs/mito.py index 1f6f9a4..20529bc 100644 --- a/src/cpg_flow_mito/jobs/mito.py +++ b/src/cpg_flow_mito/jobs/mito.py @@ -703,6 +703,7 @@ def mitoreport( vcf_path: Path, cram_path: Path, output_path: Path, + annotations: Path, job_attrs: dict, ) -> Job: """ @@ -716,6 +717,7 @@ def mitoreport( res = resources.STANDARD.request_resources(ncpu=2) res.set_to_job(j) + localised_annotations = batch_instance.read_input(annotations) vcf = batch_instance.read_input_group(**{'vcf.gz': str(vcf_path)}) cram = batch_instance.read_input_group( **{ @@ -732,7 +734,7 @@ def mitoreport( java -jar mitoreport.jar mito-report \ -sample {sequencing_group.id} \ - -mann resources/mito_map_annotations.json \ + -mann {localised_annotations} \ -gnomad resources/gnomad.genomes.v3.1.sites.chrM.vcf.bgz \ -vcf {vcf['vcf.gz']} \ {sequencing_group.id}.bam ./resources/controls/*.bam diff --git a/src/cpg_flow_mito/stages.py b/src/cpg_flow_mito/stages.py index 8ea7f68..677b371 100644 --- a/src/cpg_flow_mito/stages.py +++ b/src/cpg_flow_mito/stages.py @@ -5,11 +5,47 @@ https://github.com/broadinstitute/gatk/blob/master/scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl """ + + +import zoneinfo +from datetime import datetime +from functools import cache + from cpg_flow import stage, targets, workflow -from cpg_utils import Path, config, hail_batch +from cpg_flow.stage import StageInput, StageOutput +from cpg_flow.targets import MultiCohort +from cpg_utils import Path, config, hail_batch, to_path + +from cpg_flow_mito.jobs import bcftools, mito, picard, vep, annotations_update + + + +@cache +def get_path_to_mito_ref_data(): + """Build a path to the expected MitoMap annotations.""" + tz = zoneinfo.ZoneInfo('Australia/Brisbane') + this_month_as_string = datetime.now(tz=tz).strftime('%Y-%m') + common_default = config.config_retrieve(['storage', 'common', 'default']) + return to_path(common_default) / 'mitoreport_ref' / this_month_as_string / 'mito_map_annotations.json' -from cpg_flow_mito.jobs import bcftools, mito, picard, vep +@stage.stage +class DownloadMitoMapData(stage.MultiCohortStage): + """A once-monthly download of the data required in Mitomap.""" + + def expected_outputs(self, _multicohort: MultiCohort) -> dict[str,Path]: + return { + 'annotations': get_path_to_mito_ref_data() + } + + def queue_jobs( + self, + multicohort: MultiCohort, + _inputs: StageInput, + ) -> StageOutput: + output = self.expected_outputs(multicohort) + job = annotations_update.download_latest_annotations(output['annotations'], job_attrs=self.get_job_attrs(multicohort),) + return self.make_outputs(multicohort, output, jobs=job) @stage.stage( analysis_type='mito-cram', @@ -339,7 +375,7 @@ def queue_jobs( @stage.stage( - required_stages=[RealignMito, GenotypeMito], + required_stages=[DownloadMitoMapData, RealignMito, GenotypeMito], analysis_type='web', analysis_keys=['mitoreport'], ) @@ -370,6 +406,9 @@ def queue_jobs( ) -> stage.StageOutput: outputs = self.expected_outputs(sequencing_group) + multicohort = workflow.get_multicohort() + mitomap_annotations = inputs.as_path(multicohort, DownloadMitoMapData, 'annotations') + jobs = [] vep_j = vep.vep_one( @@ -385,6 +424,7 @@ def queue_jobs( vcf_path=outputs['vep_vcf'], cram_path=inputs.as_path(sequencing_group, RealignMito, 'non_shifted_cram'), output_path=outputs['mitoreport'], + annotations=mitomap_annotations, job_attrs=self.get_job_attrs(sequencing_group), ) if mitoreport_j: From ad8874b09ddc0e41465d69dd1bcecfd50bbdf872 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Fri, 9 Jan 2026 10:42:45 +1000 Subject: [PATCH 2/5] corrected quoting --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index dc96004..c0b4497 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -104,8 +104,8 @@ commit_args = "" [[tool.bumpversion.files]] filename = "pyproject.toml" -search = "version='{current_version}'" -replace = "version='{new_version}'" +search = 'version="{current_version}"' +replace = 'version="{new_version}"' [[tool.bumpversion.files]] filename = "Dockerfile" From e510c8841e052eafbd342256264807dc4056ccf9 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Fri, 9 Jan 2026 10:47:43 +1000 Subject: [PATCH 3/5] linting --- pyproject.toml | 2 -- src/cpg_flow_mito/jobs/annotations_update.py | 2 +- src/cpg_flow_mito/stages.py | 17 ++++++++--------- 3 files changed, 9 insertions(+), 12 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index c0b4497..8ad05af 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -75,9 +75,7 @@ ignore = [ "ANN401", # Dynamically typed expressions (typing.Any) are disallowed "ANN204", # Missing type annotation for special method `__init__` "COM812", # Trailing comma prohibited - "E731", # Do not assign a lambda expression, use a def "G004", # Logging statement uses f-string - "PLW0603", # Using the global statement to update `` is discouraged "Q000", # Single quotes found but double quotes preferred "Q003", # Single quotes found but double quotes preferred "PLR0912", # Too many branches (> 12) diff --git a/src/cpg_flow_mito/jobs/annotations_update.py b/src/cpg_flow_mito/jobs/annotations_update.py index 82fd4d7..9eeb031 100644 --- a/src/cpg_flow_mito/jobs/annotations_update.py +++ b/src/cpg_flow_mito/jobs/annotations_update.py @@ -1,5 +1,5 @@ +from cpg_utils import Path, config, hail_batch -from cpg_utils import Path, hail_batch, config def download_latest_annotations(output_path: Path, job_attrs: dict[str, str]): """Trigger the MitoMap download, save to GCP.""" diff --git a/src/cpg_flow_mito/stages.py b/src/cpg_flow_mito/stages.py index 677b371..3c1fcb2 100644 --- a/src/cpg_flow_mito/stages.py +++ b/src/cpg_flow_mito/stages.py @@ -5,8 +5,6 @@ https://github.com/broadinstitute/gatk/blob/master/scripts/mitochondria_m2_wdl/MitochondriaPipeline.wdl """ - - import zoneinfo from datetime import datetime from functools import cache @@ -16,8 +14,7 @@ from cpg_flow.targets import MultiCohort from cpg_utils import Path, config, hail_batch, to_path -from cpg_flow_mito.jobs import bcftools, mito, picard, vep, annotations_update - +from cpg_flow_mito.jobs import annotations_update, bcftools, mito, picard, vep @cache @@ -33,10 +30,8 @@ def get_path_to_mito_ref_data(): class DownloadMitoMapData(stage.MultiCohortStage): """A once-monthly download of the data required in Mitomap.""" - def expected_outputs(self, _multicohort: MultiCohort) -> dict[str,Path]: - return { - 'annotations': get_path_to_mito_ref_data() - } + def expected_outputs(self, _multicohort: MultiCohort) -> dict[str, Path]: + return {'annotations': get_path_to_mito_ref_data()} def queue_jobs( self, @@ -44,9 +39,13 @@ def queue_jobs( _inputs: StageInput, ) -> StageOutput: output = self.expected_outputs(multicohort) - job = annotations_update.download_latest_annotations(output['annotations'], job_attrs=self.get_job_attrs(multicohort),) + job = annotations_update.download_latest_annotations( + output['annotations'], + job_attrs=self.get_job_attrs(multicohort), + ) return self.make_outputs(multicohort, output, jobs=job) + @stage.stage( analysis_type='mito-cram', analysis_keys=['non_shifted_cram'], From 7665ac0734063f041c2e5471d869425ae0230ed5 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Fri, 9 Jan 2026 11:23:05 +1000 Subject: [PATCH 4/5] include auto-retry behaviour during update --- src/cpg_flow_mito/jobs/annotations_update.py | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/src/cpg_flow_mito/jobs/annotations_update.py b/src/cpg_flow_mito/jobs/annotations_update.py index 9eeb031..1f1062e 100644 --- a/src/cpg_flow_mito/jobs/annotations_update.py +++ b/src/cpg_flow_mito/jobs/annotations_update.py @@ -9,6 +9,14 @@ def download_latest_annotations(output_path: Path, job_attrs: dict[str, str]): job.image(config.config_retrieve(['images', 'mitoreport'])) # noted that this succeeds locally, but may be fragile. Perhaps a retry wrap makes sense. - job.command(f'java -jar mitoreport.jar mito-map-download --output {job.output}') + job.command(f""" + n=0 + until [ "$n" -ge 5 ] + do + java -jar mitoreport.jar mito-map-download --output {job.output} && break + n=$((n+1)) + sleep 20 + done + """) batch_instance.write_output(job.output, output_path) return job From 5c04100681709b0bd66cbab1f292caebc0f70105 Mon Sep 17 00:00:00 2001 From: MattWellie Date: Fri, 9 Jan 2026 15:58:57 +1000 Subject: [PATCH 5/5] lint --- src/cpg_flow_mito/jobs/annotations_update.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cpg_flow_mito/jobs/annotations_update.py b/src/cpg_flow_mito/jobs/annotations_update.py index 1f1062e..8f85260 100644 --- a/src/cpg_flow_mito/jobs/annotations_update.py +++ b/src/cpg_flow_mito/jobs/annotations_update.py @@ -14,7 +14,7 @@ def download_latest_annotations(output_path: Path, job_attrs: dict[str, str]): until [ "$n" -ge 5 ] do java -jar mitoreport.jar mito-map-download --output {job.output} && break - n=$((n+1)) + n=$((n+1)) sleep 20 done """)