Skip to content

Commit 2019bce

Browse files
authored
Merge pull request #133 from molgenis/fix/allele_frequency
fix: Allele frequency support
2 parents 9c3f0aa + b769754 commit 2019bce

File tree

6 files changed

+17
-15
lines changed

6 files changed

+17
-15
lines changed

README.md

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ CAPICE can be used as online service at http://molgenis.org/capice
1616
* [SpliceAI](https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceai)
1717
* BCF tools v1.14-1
1818
* Python >=3.8
19+
* Stripped gnomAD allele frequency counts with index file, for the assembly you use: GRCh37 [data](https://download.molgeniscloud.org/downloads/vip/resources/GRCh37/gnomad.total.r2.1.1.sites.stripped.vcf.gz) + [index file](https://download.molgeniscloud.org/downloads/vip/resources/GRCh37/gnomad.total.r2.1.1.sites.stripped.vcf.gz.csi), or GRCh38 [data](https://download.molgeniscloud.org/downloads/vip/resources/GRCh38/gnomad.genomes.v3.1.2.sites.stripped.vcf.gz) + [index file](https://download.molgeniscloud.org/downloads/vip/resources/GRCh38/gnomad.genomes.v3.1.2.sites.stripped.vcf.gz.csi)
1920

2021
## Install
2122
The CAPICE software is also provided in this repository for running CAPICE in your own environment. The following
@@ -71,6 +72,7 @@ vep --input_file <path to your input file> --format vcf --output_file <path to y
7172
--no_stats --offline --cache --dir_cache </path/to/cache/105> --species "homo_sapiens" --assembly <GRCh37 or GRCh38> --fork 4
7273
--dont_skip --allow_non_variant --use_given_ref --exclude_predicted --flag_pick_allele
7374
--plugin SpliceAI,snv=<path/to/raw_scores_snv.vcf.gz>,indel=</path/to/raw_scores_indel.vcf.gz> --dir_plugins <path to your VEP plugin directory>
75+
--custom </path/to/stripped/gnomad.vcf.gz>,gnomAD,vcf,exact,0,AF,HN
7476
```
7577

7678
Then you have to convert the VEP output to TSV using our own BCFTools script:

resources/train_input.tsv.gz

2 Bytes
Binary file not shown.

scripts/balance_dataset.py

Lines changed: 8 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -4,8 +4,8 @@
44
(%)Consequence: The consequence a variant has (Del, ins, DELINS etc.).
55
Can be supplied raw (starting with %) or processed. The most important part
66
is Consequence itself.
7-
MAX_AF: The desired allele frequency per variant. Can originate from gnomAD or
8-
1000 genomes or NHLBI-ESP, but has to be named MAX_AF.
7+
gnomAD_AF: The allele frequency to balance on, per variant. Can originate from gnomAD,
8+
1000 Genomes or NHLBI-ESP, but the column has to be named gnomAD_AF.
99
1010
"""
1111
import os
@@ -76,9 +76,9 @@ def _create_argument_parser():
7676
prog=os.path.basename(__file__),
7777
description='Helper script to balance out an possible input '
7878
'dataset on allele frequency and Consequence. Requires '
79-
'the columns (%)Consequence, (%)MAX_AF and '
80-
'binarized_label. MAX_AF can originate from anywhere, '
81-
'as long as it is called MAX_AF. Note: when -s/--split '
79+
'the columns (%)Consequence, (%)gnomAD_AF and '
80+
'binarized_label. gnomAD_AF can originate from anywhere, '
81+
'as long as it is called gnomAD_AF. Note: when -s/--split '
8282
'is called, it will split before balancing.'
8383
)
8484

@@ -175,7 +175,7 @@ class InputDatasetValidator:
175175

176176
@staticmethod
177177
def validate_columns_required(dataset: pd.DataFrame):
178-
required_columns = ['Consequence', 'MAX_AF', 'binarized_label']
178+
required_columns = ['Consequence', 'gnomAD_AF', 'binarized_label']
179179
for col in required_columns:
180180
if col not in dataset.columns:
181181
raise KeyError(f'Required column {col} not found in input dataset.')
@@ -259,7 +259,7 @@ def _process_consequence(self, pathogenic_dataset, benign_dataset):
259259
random_state=__random_state__
260260
)
261261
pathogenic_histogram, bins = np.histogram(
262-
pathogenic_dataset['MAX_AF'],
262+
pathogenic_dataset['gnomAD_AF'],
263263
bins=self.bins
264264
)
265265
processed_bins = pd.DataFrame(columns=self.columns)
@@ -299,7 +299,7 @@ def _process_bins(
299299

300300
@staticmethod
301301
def _get_variants_within_range(dataset, upper_bound, lower_bound):
302-
return dataset[(dataset['MAX_AF'] >= lower_bound) & (dataset['MAX_AF'] < upper_bound)]
302+
return dataset[(dataset['gnomAD_AF'] >= lower_bound) & (dataset['gnomAD_AF'] < upper_bound)]
303303

304304

305305
class BalanceExporter:

scripts/convert_vep_vcf_to_tsv_capice.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
set -e
55

66
# Variables that may need to be adjusted
7-
PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL"
7+
PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL\t%gnomAD_AF"
88

99
# Defines error echo.
1010
errcho() { echo "$@" 1>&2; }

src/molgenis/capice/__init__.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1 +1 @@
1-
__version__ = '3.2.1'
1+
__version__ = '3.3.0'

tests/capice/test_balance_dataset.py

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -94,11 +94,11 @@ def test_balancer(self):
9494
lower_bound = __bins__[ind]
9595
upper_bound = __bins__[ind + 1]
9696
self.assertEqual(
97-
balanced_dataset[(balanced_dataset['MAX_AF'] >= lower_bound) &
98-
(balanced_dataset['MAX_AF'] < upper_bound) &
97+
balanced_dataset[(balanced_dataset['gnomAD_AF'] >= lower_bound) &
98+
(balanced_dataset['gnomAD_AF'] < upper_bound) &
9999
(balanced_dataset['binarized_label'] == 0)].shape[0],
100-
balanced_dataset[(balanced_dataset['MAX_AF'] >= lower_bound) &
101-
(balanced_dataset['MAX_AF'] < upper_bound) &
100+
balanced_dataset[(balanced_dataset['gnomAD_AF'] >= lower_bound) &
101+
(balanced_dataset['gnomAD_AF'] < upper_bound) &
102102
(balanced_dataset['binarized_label'] == 1)].shape[0]
103103
)
104104

@@ -145,7 +145,7 @@ def test_dataset_validator(self):
145145
dataset = self.dataset.copy(deep=True)
146146
self.assertRaises(KeyError,
147147
validator.validate_columns_required,
148-
dataset.drop(columns=['MAX_AF']))
148+
dataset.drop(columns=['gnomAD_AF']))
149149
self.assertRaises(ValueError,
150150
validator.validate_b_p_present,
151151
dataset[dataset['binarized_label'] == 0])

0 commit comments

Comments
 (0)