From 08e6f811853da9ae58989e70c1a6d3ef6f1324ec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kajus=20=C4=8Cerniauskas?= Date: Mon, 4 Mar 2024 21:16:38 +0200 Subject: [PATCH] Updated conversion function --- data_collection/pipeline.py | 8 ++++---- data_collection/tools.py | 40 ++++++++++++++++++++++--------------- 2 files changed, 28 insertions(+), 20 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 45d0dbb..bcc1a46 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,7 +1,6 @@ import pandas as pd from pandas import DataFrame, Series -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatypes, \ - LOVD_DATA_TYPES +from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatype # CONSTANTS # files @@ -67,8 +66,7 @@ def calculate_max_frequency(row): gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv") clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t') -# Convert LOVD data types -convert_lovd_to_datatypes(lovd_data) +convert_lovd_to_datatype(lovd_data) # renaming databases' columns gnomad_data.columns += "(gnomad)" @@ -94,12 +92,14 @@ def calculate_max_frequency(row): left_on="VariantOnTranscript/DNA", right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1) + # Calculating frequencies lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"]) lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy() max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1) lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values + # Leaving necessary columns lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id', diff --git a/data_collection/tools.py b/data_collection/tools.py index cb67ff5..acf1bdd 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -4,7 +4,7 @@ from pandas 
import DataFrame -# Exceptions +# EXCEPTIONS class BadResponseException(Exception): pass @@ -232,25 +232,33 @@ def get_file_from_url(url, save_to, override=False): print(f"Error: {e}") -def convert_lovd_to_datatypes(table): +def convert_lovd_to_datatype(df_dict): """ - Converts data from LOVD format to the desired data format based on the specified data types. + Convert data from LOVD format table to desired data format based on specified data types. - :param dict[str, tuple[DataFrame, list[str]]] table: dictionary of tables - :returns: dictionary of tables with converted data types - :rtype: dict[str, tuple[DataFrame, list[str]]] + :param dict[str, tuple[DataFrame, list[str]]] df_dict: Dictionary of tables where each table is represented by its name + and contains a tuple with a DataFrame and a list of notes. """ - try: - for table_name, (frame, notes) in table.items(): - if table_name in LOVD_DATA_TYPES: - frame = frame.astype(LOVD_DATA_TYPES[table_name]) - - table[table_name] = (frame, notes) - - return table - except Exception as e: - print(f"Error: {e}") + for constant_table_name, attributes in LOVD_DATA_TYPES.items(): + frame, notes = df_dict[constant_table_name] + for column, data_type in attributes.items(): + if column not in frame.columns: + continue + + match data_type: + case "Date": + frame[column] = pd.to_datetime(frame[column], errors='coerce') + case "Boolean": + frame[column] = (frame[column] != 0).astype('bool') + case "String": + frame[column] = frame[column].astype('string') + case "Integer": + frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('Int64') + case "Double": + frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('float') + case _: + continue def from_lovd_to_pandas(path):