Merge pull request #30 from molgenis/feat/simplified_bcftools

Feat/simplified bcftools
molgenis · Oct 4, 2022 · fb5ac8f · fb5ac8f
2 parents 2e43ad2 + 17c9821
commit fb5ac8f
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 23 deletions.
diff --git a/utility_scripts/compare_models.py b/utility_scripts/compare_models.py
@@ -190,16 +190,6 @@ def validate_output_argument(output):
                 exit(1)
 
 
-def correct_column_names(columns: typing.Iterable):
-    processed_columns = []
-    for column in columns:
-        if column.startswith('%'):
-            processed_columns.append(column.split('%')[1])
-        else:
-            processed_columns.append(column)
-    return processed_columns
-
-
 def split_consequences(consequences: pd.Series):
     splitted_consequences = consequences.str.split('&', expand=True)
     return pd.Series(splitted_consequences.values.ravel()).dropna().sort_values(
@@ -208,10 +198,8 @@ def split_consequences(consequences: pd.Series):
 
 def prepare_data_file(validator, scores, labels, model_number, force_merge):
     scores_model = pd.read_csv(scores, sep='\t', na_values='.')
-    scores_model.columns = correct_column_names(scores_model.columns)
     validator.validate_score_column_present(scores_model, model_number)
     labels_model = pd.read_csv(labels, sep='\t', na_values='.')
-    labels_model.columns = correct_column_names(labels_model.columns)
     validator.validate_bl_column_present(labels_model, model_number)
     m_cons = validator.validate_consequence_column_present(labels_model)
     if scores_model.shape[0] == labels_model.shape[0]:

diff --git a/utility_scripts/compare_to_legacy_model.py b/utility_scripts/compare_to_legacy_model.py
@@ -9,8 +9,6 @@
 from matplotlib import pyplot as plt
 from sklearn.metrics import roc_curve, auc
 
-from utility_scripts.compare_models import correct_column_names
-
 ID_SEPARATOR = '!'
 
 
@@ -194,7 +192,6 @@ def main():
     old_labels = pd.read_csv(old_labels, sep='\t', skiprows=n_skip)
     new_scores = pd.read_csv(new_scores, sep='\t')
     new_labels = pd.read_csv(new_labels, sep='\t')
-    new_labels.columns = correct_column_names(new_labels.columns)
     print('Validating data')
     validator.validate_old_scores_dataset(old_scores)
     validator.validate_old_labels_dataset(old_labels)

diff --git a/utility_scripts/process_vep_tsv.py b/utility_scripts/process_vep_tsv.py
@@ -84,7 +84,7 @@ def validate_output_cla(output_path):
 
     @staticmethod
     def validate_input_dataset(input_data):
-        columns_must_be_present = ['%SYMBOL', '%CHROM', '%ID']
+        columns_must_be_present = ['SYMBOL', 'CHROM', 'ID']
         for column in columns_must_be_present:
             if column not in input_data.columns:
                 raise DataError(f'Missing required column: {column}')
@@ -113,16 +113,16 @@ def main():
 
     print('Dropping entries without gene.')
     before_drop = data.shape[0]
-    data.drop(index=data[data['%SYMBOL'].isnull()].index, inplace=True)
+    data.drop(index=data[data['SYMBOL'].isnull()].index, inplace=True)
     after_drop = data.shape[0]
     print(f'Dropped {before_drop-after_drop} variants.\n')
 
     if grch38:
         print('Converting chromosome column')
-        data['%CHROM'] = data['%CHROM'].str.split('chr', expand=True)[1]
+        data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1]
         y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT'])
         before_drop = data.shape[0]
-        data.drop(data[~data["%CHROM"].isin(y)].index, inplace=True)
+        data.drop(data[~data["CHROM"].isin(y)].index, inplace=True)
         after_drop = data.shape[0]
         print(f'Dropped {before_drop-after_drop} rows due to unknown chromosome.')
         print('Conversion done.\n')
@@ -137,21 +137,21 @@ def main():
     print('Dropping mismatching gene entries.')
     before_drop = data.shape[0]
     data.drop(
-        index=data[data['%ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['%SYMBOL']].index,
+        index=data[data['ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['SYMBOL']].index,
         inplace=True
     )
     after_drop = data.shape[0]
     print(f'Dropped {before_drop-after_drop} variants.\n')
 
     print('Extracting sample weight and binarized_label')
-    data['binarized_label'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
-    data['sample_weight'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
+    data['binarized_label'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
+    data['sample_weight'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
     print('')
 
     print('Correcting possible errors within binarized_label or sample_weight')
     before_drop = data.shape[0]
     # Drop everything that doesn't have a binarized_label, also drop unused columns
-    data.drop(index=data[data['binarized_label'].isnull()].index, columns=['%ID'], inplace=True)
+    data.drop(index=data[data['binarized_label'].isnull()].index, columns=['ID'], inplace=True)
     data.drop(index=data[~data['binarized_label'].isin([0.0, 1.0])].index, inplace=True)
     data.drop(index=data[~data['sample_weight'].isin(SAMPLE_WEIGHTS)].index, inplace=True)
     after_drop = data.shape[0]