From 4d2835c6e3f04486d96d0151f888e18ed4185f3d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Kajus=20=C4=8Cerniauskas?= Date: Mon, 4 Mar 2024 20:19:27 +0200 Subject: [PATCH] Changed datatype function --- data_collection/pipeline.py | 4 ++-- data_collection/tools.py | 29 ----------------------------- 2 files changed, 2 insertions(+), 31 deletions(-) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 5cf52ef..45d0dbb 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,6 +1,6 @@ import pandas as pd from pandas import DataFrame, Series -from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatype, \ +from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatypes, \ LOVD_DATA_TYPES # CONSTANTS @@ -68,7 +68,7 @@ def calculate_max_frequency(row): clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t') # Convert LOVD data types -convert_lovd_to_datatype(from_lovd_to_pandas(LOVD_PATH + "/lovd_data.txt")) +convert_lovd_to_datatypes(lovd_data) # renaming databases' columns gnomad_data.columns += "(gnomad)" diff --git a/data_collection/tools.py b/data_collection/tools.py index aa81520..cb67ff5 100644 --- a/data_collection/tools.py +++ b/data_collection/tools.py @@ -253,35 +253,6 @@ def convert_lovd_to_datatypes(table): print(f"Error: {e}") -def convert_lovd_to_datatype(table): - """ - Convert data from LOVD format table to desired data format based on specified data types. - - :param dict table: Dictionary of tables where each table is represented by its name - and contains a tuple with a DataFrame and a list of notes. - """ - - for constant_table_name, attributes in LOVD_DATA_TYPES.items(): - frame, notes = table[constant_table_name] - for column, data_type in attributes.items(): - if column not in frame.columns: - continue - - match [data_type]: - case ["Date"]: - frame[column] = pd.to_datetime(frame[column], errors='coerce') - case ["Boolean"]: - frame[column] = (frame[column] != 0).astype('bool') - case ["String"]: - frame[column] = frame[column].astype('string') - case ["Integer"]: - frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('Int64') - case ["Double"]: - frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('float') - case _: - continue - - def from_lovd_to_pandas(path): """ Converts data from text file with LOVD format to dictionary of tables. \