Skip to content

Commit

Permalink
Refine combiner usage (#1097)
Browse files Browse the repository at this point in the history
* allow granularity on all combiner options

* Bump version: 1.32.13 → 1.32.14
  • Loading branch information
MattWellie authored Jan 10, 2025
1 parent 49e1ea5 commit fcc5135
Show file tree
Hide file tree
Showing 5 changed files with 38 additions and 3 deletions.
2 changes: 1 addition & 1 deletion .bumpversion.cfg
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
[bumpversion]
current_version = 1.32.13
current_version = 1.32.14
commit = True
tag = False

Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/docker.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ permissions:
contents: read

env:
VERSION: 1.32.13
VERSION: 1.32.14

jobs:
docker:
Expand Down
15 changes: 15 additions & 0 deletions configs/defaults/rd_combiner.toml
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,21 @@ worker_memory = "standard"
# if false, use non-preemptible VMs
preemptible_vms = false

# these settings alter the behaviour of the combiner, and don't all align with the documentation

# In config: "The number of Variant Datasets to combine at once."
# In practice: "The number of gVCFs to combine into each VDS?"
# https://github.com/hail-is/hail/issues/14781
branch_factor = 50

# when merging multiple VDS, we find the largest VDS, repartition to target_records variants per partition
# then repartition all VDSs to match those intervals prior to merging
target_records = 30000

# this is supposed to be the number of gVCFs to combine into a VDS
# but that is not currently working. See issue above
gvcf_batch_size = 5

[vqsr]
# VQSR, when applying model, targets indel_filter_level and snp_filter_level
# sensitivities. The tool matches them internally to a VQSLOD score cutoff
Expand Down
20 changes: 20 additions & 0 deletions cpg_workflows/jobs/rd_combiner/combiner.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def run(
import logging

import hail as hl
from hail.vds.combiner.variant_dataset_combiner import VariantDatasetCombiner

from cpg_utils.config import config_retrieve
from cpg_utils.hail_batch import init_batch
Expand Down Expand Up @@ -76,6 +77,25 @@ def run(
use_genome_default_intervals=sequencing_type == 'genome',
intervals=intervals,
force=force_new_combiner,
# we're defaulting to the protected class attributes here, which looks like a hack...
# for branch factor and target records, the argument uses a specific value as a default
# so if we don't find an entry in config, we can't pass None to the constructor...
# we either access the protected class attributes, hard-code the default on our side,
# or have two separate constructors depending on whether we override the default or not
branch_factor=config_retrieve(
['combiner', 'branch_factor'],
VariantDatasetCombiner._default_branch_factor,
),
target_records=config_retrieve(
['combiner', 'target_records'],
VariantDatasetCombiner._default_target_records,
),
# this argument does default to None, and will be set to the default values within the constructor
# so we're happy to pass None, no need to access the protected class attributes
gvcf_batch_size=config_retrieve(
['combiner', 'gvcf_batch_size'],
None,
),
)

combiner.run()
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
setup(
name='cpg_workflows',
# This tag is automatically updated by bumpversion
version='1.32.13',
version='1.32.14',
description='CPG workflows for Hail Batch',
long_description=open('README.md').read(),
long_description_content_type='text/markdown',
Expand Down

0 comments on commit fcc5135

Please sign in to comment.