From cf6f718b97d0eaa26d1161a0e16ae912d3b427e0 Mon Sep 17 00:00:00 2001
From: SietsmaRJ <work.robertsietsma@gmail.com>
Date: Thu, 29 Sep 2022 10:30:52 +0200
Subject: [PATCH 1/2] feat: Simplified BCFTools

- Removed the function that corrects the column names, since it will become obsolete if the PR for the simplified controls is merged into CAPICE master.
---
 utility_scripts/compare_models.py          | 12 ------------
 utility_scripts/compare_to_legacy_model.py |  3 ---
 2 files changed, 15 deletions(-)

diff --git a/utility_scripts/compare_models.py b/utility_scripts/compare_models.py
index d97ec8b..d2b0cfe 100755
--- a/utility_scripts/compare_models.py
+++ b/utility_scripts/compare_models.py
@@ -190,16 +190,6 @@ def validate_output_argument(output):
                 exit(1)
 
 
-def correct_column_names(columns: typing.Iterable):
-    processed_columns = []
-    for column in columns:
-        if column.startswith('%'):
-            processed_columns.append(column.split('%')[1])
-        else:
-            processed_columns.append(column)
-    return processed_columns
-
-
 def split_consequences(consequences: pd.Series):
     splitted_consequences = consequences.str.split('&', expand=True)
     return pd.Series(splitted_consequences.values.ravel()).dropna().sort_values(
@@ -208,10 +198,8 @@ def split_consequences(consequences: pd.Series):
 
 def prepare_data_file(validator, scores, labels, model_number, force_merge):
     scores_model = pd.read_csv(scores, sep='\t', na_values='.')
-    scores_model.columns = correct_column_names(scores_model.columns)
     validator.validate_score_column_present(scores_model, model_number)
     labels_model = pd.read_csv(labels, sep='\t', na_values='.')
-    labels_model.columns = correct_column_names(labels_model.columns)
     validator.validate_bl_column_present(labels_model, model_number)
     m_cons = validator.validate_consequence_column_present(labels_model)
     if scores_model.shape[0] == labels_model.shape[0]:
diff --git a/utility_scripts/compare_to_legacy_model.py b/utility_scripts/compare_to_legacy_model.py
index 47f1e8a..5faf674 100755
--- a/utility_scripts/compare_to_legacy_model.py
+++ b/utility_scripts/compare_to_legacy_model.py
@@ -9,8 +9,6 @@
 from matplotlib import pyplot as plt
 from sklearn.metrics import roc_curve, auc
 
-from utility_scripts.compare_models import correct_column_names
-
 ID_SEPARATOR = '!'
 
 
@@ -194,7 +192,6 @@ def main():
     old_labels = pd.read_csv(old_labels, sep='\t', skiprows=n_skip)
     new_scores = pd.read_csv(new_scores, sep='\t')
     new_labels = pd.read_csv(new_labels, sep='\t')
-    new_labels.columns = correct_column_names(new_labels.columns)
     print('Validating data')
     validator.validate_old_scores_dataset(old_scores)
     validator.validate_old_labels_dataset(old_labels)

From 663ba9b03c76fec9a28cf8c1ce7d3f5330c97547 Mon Sep 17 00:00:00 2001
From: SietsmaRJ <work.robertsietsma@gmail.com>
Date: Thu, 29 Sep 2022 13:27:14 +0200
Subject: [PATCH 2/2] Further removed dependency on the % sign.

---
 utility_scripts/process_vep_tsv.py | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/utility_scripts/process_vep_tsv.py b/utility_scripts/process_vep_tsv.py
index a1ab926..174a7d9 100755
--- a/utility_scripts/process_vep_tsv.py
+++ b/utility_scripts/process_vep_tsv.py
@@ -84,7 +84,7 @@ def validate_output_cla(output_path):
 
     @staticmethod
     def validate_input_dataset(input_data):
-        columns_must_be_present = ['%SYMBOL', '%CHROM', '%ID']
+        columns_must_be_present = ['SYMBOL', 'CHROM', 'ID']
         for column in columns_must_be_present:
             if column not in input_data.columns:
                 raise DataError(f'Missing required column: {column}')
@@ -113,16 +113,16 @@ def main():
     
     print('Dropping entries without gene.')
     before_drop = data.shape[0]
-    data.drop(index=data[data['%SYMBOL'].isnull()].index, inplace=True)
+    data.drop(index=data[data['SYMBOL'].isnull()].index, inplace=True)
     after_drop = data.shape[0]
     print(f'Dropped {before_drop-after_drop} variants.\n')
 
     if grch38:
         print('Converting chromosome column')
-        data['%CHROM'] = data['%CHROM'].str.split('chr', expand=True)[1]
+        data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1]
         y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT'])
         before_drop = data.shape[0]
-        data.drop(data[~data["%CHROM"].isin(y)].index, inplace=True)
+        data.drop(data[~data["CHROM"].isin(y)].index, inplace=True)
         after_drop = data.shape[0]
         print(f'Dropped {before_drop-after_drop} rows due to unknown chromosome.')
         print('Conversion done.\n')
@@ -137,21 +137,21 @@ def main():
     print('Dropping mismatching gene entries.')
     before_drop = data.shape[0]
     data.drop(
-        index=data[data['%ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['%SYMBOL']].index,
+        index=data[data['ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['SYMBOL']].index,
         inplace=True
     )
     after_drop = data.shape[0]
     print(f'Dropped {before_drop-after_drop} variants.\n')
     
     print('Extracting sample weight and binarized_label')
-    data['binarized_label'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
-    data['sample_weight'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
+    data['binarized_label'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
+    data['sample_weight'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
     print('')
     
     print('Correcting possible errors within binarized_label or sample_weight')
     before_drop = data.shape[0]
     # Drop everything that doesn't have a binarized_label, also drop unused columns
-    data.drop(index=data[data['binarized_label'].isnull()].index, columns=['%ID'], inplace=True)
+    data.drop(index=data[data['binarized_label'].isnull()].index, columns=['ID'], inplace=True)
     data.drop(index=data[~data['binarized_label'].isin([0.0, 1.0])].index, inplace=True)
     data.drop(index=data[~data['sample_weight'].isin(SAMPLE_WEIGHTS)].index, inplace=True)
     after_drop = data.shape[0]