From cf6f718b97d0eaa26d1161a0e16ae912d3b427e0 Mon Sep 17 00:00:00 2001 From: SietsmaRJ Date: Thu, 29 Sep 2022 10:30:52 +0200 Subject: [PATCH 1/2] feat: Simplified BCFTools - Removed the function to correct the column names, since that will become deprecated if the PR is merged on CAPICE Master for the simplified controls. --- utility_scripts/compare_models.py | 12 ------------ utility_scripts/compare_to_legacy_model.py | 3 --- 2 files changed, 15 deletions(-) diff --git a/utility_scripts/compare_models.py b/utility_scripts/compare_models.py index d97ec8b..d2b0cfe 100755 --- a/utility_scripts/compare_models.py +++ b/utility_scripts/compare_models.py @@ -190,16 +190,6 @@ def validate_output_argument(output): exit(1) -def correct_column_names(columns: typing.Iterable): - processed_columns = [] - for column in columns: - if column.startswith('%'): - processed_columns.append(column.split('%')[1]) - else: - processed_columns.append(column) - return processed_columns - - def split_consequences(consequences: pd.Series): splitted_consequences = consequences.str.split('&', expand=True) return pd.Series(splitted_consequences.values.ravel()).dropna().sort_values( @@ -208,10 +198,8 @@ def split_consequences(consequences: pd.Series): def prepare_data_file(validator, scores, labels, model_number, force_merge): scores_model = pd.read_csv(scores, sep='\t', na_values='.') - scores_model.columns = correct_column_names(scores_model.columns) validator.validate_score_column_present(scores_model, model_number) labels_model = pd.read_csv(labels, sep='\t', na_values='.') - labels_model.columns = correct_column_names(labels_model.columns) validator.validate_bl_column_present(labels_model, model_number) m_cons = validator.validate_consequence_column_present(labels_model) if scores_model.shape[0] == labels_model.shape[0]: diff --git a/utility_scripts/compare_to_legacy_model.py b/utility_scripts/compare_to_legacy_model.py index 47f1e8a..5faf674 100755 --- a/utility_scripts/compare_to_legacy_model.py +++ b/utility_scripts/compare_to_legacy_model.py @@ -9,8 +9,6 @@ from matplotlib import pyplot as plt from sklearn.metrics import roc_curve, auc -from utility_scripts.compare_models import correct_column_names - ID_SEPARATOR = '!' @@ -194,7 +192,6 @@ def main(): old_labels = pd.read_csv(old_labels, sep='\t', skiprows=n_skip) new_scores = pd.read_csv(new_scores, sep='\t') new_labels = pd.read_csv(new_labels, sep='\t') - new_labels.columns = correct_column_names(new_labels.columns) print('Validating data') validator.validate_old_scores_dataset(old_scores) validator.validate_old_labels_dataset(old_labels) From 663ba9b03c76fec9a28cf8c1ce7d3f5330c97547 Mon Sep 17 00:00:00 2001 From: SietsmaRJ Date: Thu, 29 Sep 2022 13:27:14 +0200 Subject: [PATCH 2/2] Further removed dependency on the % sign. --- utility_scripts/process_vep_tsv.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/utility_scripts/process_vep_tsv.py b/utility_scripts/process_vep_tsv.py index a1ab926..174a7d9 100755 --- a/utility_scripts/process_vep_tsv.py +++ b/utility_scripts/process_vep_tsv.py @@ -84,7 +84,7 @@ def validate_output_cla(output_path): @staticmethod def validate_input_dataset(input_data): - columns_must_be_present = ['%SYMBOL', '%CHROM', '%ID'] + columns_must_be_present = ['SYMBOL', 'CHROM', 'ID'] for column in columns_must_be_present: if column not in input_data.columns: raise DataError(f'Missing required column: {column}') @@ -113,16 +113,16 @@ def main(): print('Dropping entries without gene.') before_drop = data.shape[0] - data.drop(index=data[data['%SYMBOL'].isnull()].index, inplace=True) + data.drop(index=data[data['SYMBOL'].isnull()].index, inplace=True) after_drop = data.shape[0] print(f'Dropped {before_drop-after_drop} variants.\n') if grch38: print('Converting chromosome column') - data['%CHROM'] = data['%CHROM'].str.split('chr', expand=True)[1] + data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1] y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT']) before_drop = data.shape[0] - data.drop(data[~data["%CHROM"].isin(y)].index, inplace=True) + data.drop(data[~data["CHROM"].isin(y)].index, inplace=True) after_drop = data.shape[0] print(f'Dropped {before_drop-after_drop} rows due to unknown chromosome.') print('Conversion done.\n') @@ -137,21 +137,21 @@ def main(): print('Dropping mismatching gene entries.') before_drop = data.shape[0] data.drop( - index=data[data['%ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['%SYMBOL']].index, + index=data[data['ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['SYMBOL']].index, inplace=True ) after_drop = data.shape[0] print(f'Dropped {before_drop-after_drop} variants.\n') print('Extracting sample weight and binarized_label') - data['binarized_label'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float) - data['sample_weight'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float) + data['binarized_label'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float) + data['sample_weight'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float) print('') print('Correcting possible errors within binarized_label or sample_weight') before_drop = data.shape[0] # Drop everything that doesn't have a binarized_label, also drop unused columns - data.drop(index=data[data['binarized_label'].isnull()].index, columns=['%ID'], inplace=True) + data.drop(index=data[data['binarized_label'].isnull()].index, columns=['ID'], inplace=True) data.drop(index=data[~data['binarized_label'].isin([0.0, 1.0])].index, inplace=True) data.drop(index=data[~data['sample_weight'].isin(SAMPLE_WEIGHTS)].index, inplace=True) after_drop = data.shape[0]