Skip to content

Commit

Permalink
Merge pull request #30 from molgenis/feat/simplified_bcftools
Browse files Browse the repository at this point in the history
Feat/simplified bcftools
  • Loading branch information
svandenhoek authored Oct 4, 2022
2 parents 2e43ad2 + 17c9821 commit fb5ac8f
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 23 deletions.
12 changes: 0 additions & 12 deletions utility_scripts/compare_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -190,16 +190,6 @@ def validate_output_argument(output):
exit(1)


def correct_column_names(columns: typing.Iterable):
    """Return the column names with a single leading '%' removed.

    Names that do not start with '%' are returned unchanged. Only the first
    character is stripped, so any further '%' inside a name is preserved
    (e.g. '%A%B' becomes 'A%B' rather than being truncated to 'A').

    :param columns: iterable of column name strings.
    :return: list of processed column names, in the original order.
    """
    return [
        # Slice instead of split('%')[1]: splitting discards everything
        # after a second '%' in the name.
        column[1:] if column.startswith('%') else column
        for column in columns
    ]


def split_consequences(consequences: pd.Series):
splitted_consequences = consequences.str.split('&', expand=True)
return pd.Series(splitted_consequences.values.ravel()).dropna().sort_values(
Expand All @@ -208,10 +198,8 @@ def split_consequences(consequences: pd.Series):

def prepare_data_file(validator, scores, labels, model_number, force_merge):
scores_model = pd.read_csv(scores, sep='\t', na_values='.')
scores_model.columns = correct_column_names(scores_model.columns)
validator.validate_score_column_present(scores_model, model_number)
labels_model = pd.read_csv(labels, sep='\t', na_values='.')
labels_model.columns = correct_column_names(labels_model.columns)
validator.validate_bl_column_present(labels_model, model_number)
m_cons = validator.validate_consequence_column_present(labels_model)
if scores_model.shape[0] == labels_model.shape[0]:
Expand Down
3 changes: 0 additions & 3 deletions utility_scripts/compare_to_legacy_model.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
from matplotlib import pyplot as plt
from sklearn.metrics import roc_curve, auc

from utility_scripts.compare_models import correct_column_names

ID_SEPARATOR = '!'


Expand Down Expand Up @@ -194,7 +192,6 @@ def main():
old_labels = pd.read_csv(old_labels, sep='\t', skiprows=n_skip)
new_scores = pd.read_csv(new_scores, sep='\t')
new_labels = pd.read_csv(new_labels, sep='\t')
new_labels.columns = correct_column_names(new_labels.columns)
print('Validating data')
validator.validate_old_scores_dataset(old_scores)
validator.validate_old_labels_dataset(old_labels)
Expand Down
16 changes: 8 additions & 8 deletions utility_scripts/process_vep_tsv.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def validate_output_cla(output_path):

@staticmethod
def validate_input_dataset(input_data):
columns_must_be_present = ['%SYMBOL', '%CHROM', '%ID']
columns_must_be_present = ['SYMBOL', 'CHROM', 'ID']
for column in columns_must_be_present:
if column not in input_data.columns:
raise DataError(f'Missing required column: {column}')
Expand Down Expand Up @@ -113,16 +113,16 @@ def main():

print('Dropping entries without gene.')
before_drop = data.shape[0]
data.drop(index=data[data['%SYMBOL'].isnull()].index, inplace=True)
data.drop(index=data[data['SYMBOL'].isnull()].index, inplace=True)
after_drop = data.shape[0]
print(f'Dropped {before_drop-after_drop} variants.\n')

if grch38:
print('Converting chromosome column')
data['%CHROM'] = data['%CHROM'].str.split('chr', expand=True)[1]
data['CHROM'] = data['CHROM'].str.split('chr', expand=True)[1]
y = np.append(np.arange(1, 23).astype(str), ['X', 'Y', 'MT'])
before_drop = data.shape[0]
data.drop(data[~data["%CHROM"].isin(y)].index, inplace=True)
data.drop(data[~data["CHROM"].isin(y)].index, inplace=True)
after_drop = data.shape[0]
print(f'Dropped {before_drop-after_drop} rows due to unknown chromosome.')
print('Conversion done.\n')
Expand All @@ -137,21 +137,21 @@ def main():
print('Dropping mismatching gene entries.')
before_drop = data.shape[0]
data.drop(
index=data[data['%ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['%SYMBOL']].index,
index=data[data['ID'].str.split(ID_SEPARATOR, expand=True)[4] != data['SYMBOL']].index,
inplace=True
)
after_drop = data.shape[0]
print(f'Dropped {before_drop-after_drop} variants.\n')

print('Extracting sample weight and binarized_label')
data['binarized_label'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
data['sample_weight'] = data['%ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
data['binarized_label'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[5].astype(float)
data['sample_weight'] = data['ID'].str.split(ID_SEPARATOR, expand=True)[6].astype(float)
print('')

print('Correcting possible errors within binarized_label or sample_weight')
before_drop = data.shape[0]
# Drop everything that doesn't have a binarized_label, also drop unused columns
data.drop(index=data[data['binarized_label'].isnull()].index, columns=['%ID'], inplace=True)
data.drop(index=data[data['binarized_label'].isnull()].index, columns=['ID'], inplace=True)
data.drop(index=data[~data['binarized_label'].isin([0.0, 1.0])].index, inplace=True)
data.drop(index=data[~data['sample_weight'].isin(SAMPLE_WEIGHTS)].index, inplace=True)
after_drop = data.shape[0]
Expand Down

0 comments on commit fb5ac8f

Please sign in to comment.