Updated conversion function

Strexas · Mar 4, 2024 · 08e6f81 · 08e6f81
1 parent 4d2835c
commit 08e6f81
Show file tree

Hide file tree

Showing 2 changed files with 28 additions and 20 deletions.
diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
@@ -1,7 +1,6 @@
 import pandas as pd
 from pandas import DataFrame, Series
-from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatypes, \
-    LOVD_DATA_TYPES
+from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatype
 
 # CONSTANTS
 # files
@@ -67,8 +66,7 @@ def calculate_max_frequency(row):
 gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
 clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')
 
-# Convert LOVD data types
-convert_lovd_to_datatypes(lovd_data)
+convert_lovd_to_datatype(lovd_data)
 
 # renaming databases' columns
 gnomad_data.columns += "(gnomad)"
@@ -94,12 +92,14 @@ def calculate_max_frequency(row):
                       left_on="VariantOnTranscript/DNA",
                       right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1)
 
+
 # Calculating frequencies
 lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
 lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()
 max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
 lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values
 
+
 # Leaving necessary columns
 
 lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',

diff --git a/data_collection/tools.py b/data_collection/tools.py
@@ -4,7 +4,7 @@
 from pandas import DataFrame
 
 
-# Exceptions
+# EXCEPTIONS
 class BadResponseException(Exception):
     pass
 
@@ -232,25 +232,33 @@ def get_file_from_url(url, save_to, override=False):
         print(f"Error: {e}")
 
 
-def convert_lovd_to_datatypes(table):
+def convert_lovd_to_datatype(df_dict):
     """
-    Converts data from LOVD format to the desired data format based on the specified data types.
+    Convert the columns of LOVD tables to the data types specified in LOVD_DATA_TYPES.
 
-    :param dict[str, tuple[DataFrame, list[str]]] table: dictionary of tables
-    :returns: dictionary of tables with converted data types
-    :rtype: dict[str, tuple[DataFrame, list[str]]]
+    :param dict[str, tuple[DataFrame, list[str]]] df_dict: dictionary mapping each table name to a tuple of
+     (DataFrame, list of notes); the DataFrames' columns are converted in place.
     """
 
-    try:
-        for table_name, (frame, notes) in table.items():
-            if table_name in LOVD_DATA_TYPES:
-                frame = frame.astype(LOVD_DATA_TYPES[table_name])
-
-            table[table_name] = (frame, notes)
-
-        return table
-    except Exception as e:
-        print(f"Error: {e}")
+    for constant_table_name, attributes in LOVD_DATA_TYPES.items():
+        frame, notes = df_dict[constant_table_name]
+        for column, data_type in attributes.items():
+            if column not in frame.columns:
+                continue
+
+            match data_type:
+                case "Date":
+                    frame[column] = pd.to_datetime(frame[column], errors='coerce')
+                case "Boolean":
+                    frame[column] = (frame[column] != 0).astype('bool')
+                case "String":
+                    frame[column] = frame[column].astype('string')
+                case "Integer":
+                    frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('Int64')
+                case "Double":
+                    frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('float')
+                case _:
+                    continue
 
 
 def from_lovd_to_pandas(path):