Merge pull request #133 from molgenis/fix/allele_frequency

svandenhoek · web-flow · commit 2019bce49605 · 2022-09-15T15:02:06.000+02:00
fix: Allele frequency support
diff --git a/README.md b/README.md
@@ -16,6 +16,7 @@ CAPICE can be used as online service at http://molgenis.org/capice
   * [SpliceAI](https://m.ensembl.org/info/docs/tools/vep/script/vep_plugins.html#spliceai)
 * BCF tools v1.14-1
 * Python >=3.8
+* Stripped gnomAD allele frequency counts for GRCh37: [data](https://download.molgeniscloud.org/downloads/vip/resources/GRCh37/gnomad.total.r2.1.1.sites.stripped.vcf.gz) + [indexing file](https://download.molgeniscloud.org/downloads/vip/resources/GRCh37/gnomad.total.r2.1.1.sites.stripped.vcf.gz.csi) (for GRCh38: [data](https://download.molgeniscloud.org/downloads/vip/resources/GRCh38/gnomad.genomes.v3.1.2.sites.stripped.vcf.gz) + [indexing file](https://download.molgeniscloud.org/downloads/vip/resources/GRCh38/gnomad.genomes.v3.1.2.sites.stripped.vcf.gz.csi))
 
 ## Install
 The CAPICE software is also provided in this repository for running CAPICE in your own environment. The following
@@ -71,6 +72,7 @@ vep --input_file <path to your input file> --format vcf --output_file <path to y
 --no_stats --offline --cache --dir_cache </path/to/cache/105> --species "homo_sapiens" --assembly <GRCh37 or GRCh38> --fork 4 
 --dont_skip --allow_non_variant --use_given_ref --exclude_predicted --flag_pick_allele
 --plugin SpliceAI,snv=<path/to/raw_scores_snv.vcf.gz>,indel=</path/to/raw_scores_indel.vcf.gz> --dir_plugins <path to your VEP plugin directory>
+--custom </path/to/stripped/gnomad.vcf.gz>,gnomAD,vcf,exact,0,AF,HN
 ```
 
 Then you have to convert the VEP output to TSV using our own BCFTools script: 
diff --git a/resources/train_input.tsv.gz b/resources/train_input.tsv.gz
diff --git a/scripts/balance_dataset.py b/scripts/balance_dataset.py
@@ -4,8 +4,8 @@
 (%)Consequence: The consequence a variant has (Del, ins, DELINS etc.).
     Can be supplied raw (starting with %) or processed. The most important part
     is Consequence itself.
-MAX_AF: The desired allele frequency per variant. Can originate from gnomAD or
-1000 genomes or NHLBI-ESP, but has to be named MAX_AF.
+gnomAD_AF: The allele frequency of each variant. May originate from gnomAD,
+1000 Genomes or NHLBI-ESP, but the column has to be named gnomAD_AF.
 
 """
 import os
@@ -76,9 +76,9 @@ def _create_argument_parser():
             prog=os.path.basename(__file__),
             description='Helper script to balance out an possible input '
                         'dataset on allele frequency and Consequence. Requires '
-                        'the columns (%)Consequence, (%)MAX_AF and '
-                        'binarized_label. MAX_AF can originate from anywhere, '
-                        'as long as it is called MAX_AF. Note: when -s/--split '
+                        'the columns (%)Consequence, (%)gnomAD_AF and '
+                        'binarized_label. gnomAD_AF can originate from anywhere, '
+                        'as long as it is called gnomAD_AF. Note: when -s/--split '
                         'is called, it will split before balancing.'
         )
 
@@ -175,7 +175,7 @@ class InputDatasetValidator:
 
     @staticmethod
     def validate_columns_required(dataset: pd.DataFrame):
-        required_columns = ['Consequence', 'MAX_AF', 'binarized_label']
+        required_columns = ['Consequence', 'gnomAD_AF', 'binarized_label']
         for col in required_columns:
             if col not in dataset.columns:
                 raise KeyError(f'Required column {col} not found in input dataset.')
@@ -259,7 +259,7 @@ def _process_consequence(self, pathogenic_dataset, benign_dataset):
                 random_state=__random_state__
             )
         pathogenic_histogram, bins = np.histogram(
-            pathogenic_dataset['MAX_AF'],
+            pathogenic_dataset['gnomAD_AF'],
             bins=self.bins
         )
         processed_bins = pd.DataFrame(columns=self.columns)
@@ -299,7 +299,7 @@ def _process_bins(
 
     @staticmethod
     def _get_variants_within_range(dataset, upper_bound, lower_bound):
-        return dataset[(dataset['MAX_AF'] >= lower_bound) & (dataset['MAX_AF'] < upper_bound)]
+        return dataset[(dataset['gnomAD_AF'] >= lower_bound) & (dataset['gnomAD_AF'] < upper_bound)]
 
 
 class BalanceExporter:
diff --git a/scripts/convert_vep_vcf_to_tsv_capice.sh b/scripts/convert_vep_vcf_to_tsv_capice.sh
@@ -4,7 +4,7 @@
 set -e
 
 # Possibly variable variables
-PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL"
+PRE_HEADER="%CHROM\t%POS\t%REF\t%ALT\t%Consequence\t%SYMBOL\t%SYMBOL_SOURCE\t%Gene\t%Feature\t%Feature_type\t%cDNA_position\t%CDS_position\t%Protein_position\t%Amino_acids\t%STRAND\t%SIFT\t%PolyPhen\t%EXON\t%INTRON\t%SpliceAI_pred_DP_AG\t%SpliceAI_pred_DP_AL\t%SpliceAI_pred_DP_DG\t%SpliceAI_pred_DP_DL\t%SpliceAI_pred_DS_AG\t%SpliceAI_pred_DS_AL\t%SpliceAI_pred_DS_DG\t%SpliceAI_pred_DS_DL\t%gnomAD_AF"
 
 # Defines error echo.
 errcho() { echo "$@" 1>&2; }
diff --git a/src/molgenis/capice/__init__.py b/src/molgenis/capice/__init__.py
@@ -1 +1 @@
-__version__ = '3.2.1'
+__version__ = '3.3.0'
diff --git a/tests/capice/test_balance_dataset.py b/tests/capice/test_balance_dataset.py
@@ -94,11 +94,11 @@ def test_balancer(self):
             lower_bound = __bins__[ind]
             upper_bound = __bins__[ind + 1]
             self.assertEqual(
-                balanced_dataset[(balanced_dataset['MAX_AF'] >= lower_bound) &
-                                 (balanced_dataset['MAX_AF'] < upper_bound) &
+                balanced_dataset[(balanced_dataset['gnomAD_AF'] >= lower_bound) &
+                                 (balanced_dataset['gnomAD_AF'] < upper_bound) &
                                  (balanced_dataset['binarized_label'] == 0)].shape[0],
-                balanced_dataset[(balanced_dataset['MAX_AF'] >= lower_bound) &
-                                 (balanced_dataset['MAX_AF'] < upper_bound) &
+                balanced_dataset[(balanced_dataset['gnomAD_AF'] >= lower_bound) &
+                                 (balanced_dataset['gnomAD_AF'] < upper_bound) &
                                  (balanced_dataset['binarized_label'] == 1)].shape[0]
             )
 
@@ -145,7 +145,7 @@ def test_dataset_validator(self):
         dataset = self.dataset.copy(deep=True)
         self.assertRaises(KeyError,
                           validator.validate_columns_required,
-                          dataset.drop(columns=['MAX_AF']))
+                          dataset.drop(columns=['gnomAD_AF']))
         self.assertRaises(ValueError,
                           validator.validate_b_p_present,
                           dataset[dataset['binarized_label'] == 0])

| Original file line number | Diff line number | Diff line change |
| --- | --- | --- |
| | | `@@ -1 +1 @@` |
| `1` | | `-__version__ = '3.2.1'` |
| | `1` | `+__version__ = '3.3.0'` |