Skip to content

Commit

Permalink
Updated conversion function
Browse files Browse the repository at this point in the history
  • Loading branch information
KajusC committed Mar 4, 2024
1 parent 4d2835c commit 08e6f81
Show file tree
Hide file tree
Showing 2 changed files with 28 additions and 20 deletions.
8 changes: 4 additions & 4 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import pandas as pd
from pandas import DataFrame, Series
from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatypes, \
LOVD_DATA_TYPES
from tools import get_file_from_url, from_lovd_to_pandas, from_clinvar_name_to_DNA, convert_lovd_to_datatype

# CONSTANTS
# files
Expand Down Expand Up @@ -67,8 +66,7 @@ def calculate_max_frequency(row):
gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

# Convert LOVD data types
convert_lovd_to_datatypes(lovd_data)
convert_lovd_to_datatype(lovd_data)

# renaming databases' columns
gnomad_data.columns += "(gnomad)"
Expand All @@ -94,12 +92,14 @@ def calculate_max_frequency(row):
left_on="VariantOnTranscript/DNA",
right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)", axis=1)


# Calculating frequencies
lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()
max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values


# Leaving necessary columns

lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
Expand Down
40 changes: 24 additions & 16 deletions data_collection/tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from pandas import DataFrame


# Exceptions
# EXCEPTIONS
class BadResponseException(Exception):
pass

Expand Down Expand Up @@ -232,25 +232,33 @@ def get_file_from_url(url, save_to, override=False):
print(f"Error: {e}")


def convert_lovd_to_datatypes(table):
def convert_lovd_to_datatype(df_dict):
"""
Converts data from LOVD format to the desired data format based on the specified data types.
Convert the columns of the LOVD tables in place to the data types specified in LOVD_DATA_TYPES.
:param dict[str, tuple[DataFrame, list[str]]] table: dictionary of tables
:returns: dictionary of tables with converted data types
:rtype: dict[str, tuple[DataFrame, list[str]]]
:param dict[str, tuple[DataFrame, list[str]]] df_dict: Dictionary mapping each table name
to a tuple containing the table's DataFrame and a list of notes.
"""

try:
for table_name, (frame, notes) in table.items():
if table_name in LOVD_DATA_TYPES:
frame = frame.astype(LOVD_DATA_TYPES[table_name])

table[table_name] = (frame, notes)

return table
except Exception as e:
print(f"Error: {e}")
for constant_table_name, attributes in LOVD_DATA_TYPES.items():
frame, notes = df_dict[constant_table_name]
for column, data_type in attributes.items():
if column not in frame.columns:
continue

match data_type:
case "Date":
frame[column] = pd.to_datetime(frame[column], errors='coerce')
case "Boolean":
frame[column] = (frame[column] != 0).astype('bool')
case "String":
frame[column] = frame[column].astype('string')
case "Integer":
frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('Int64')
case "Double":
frame[column] = pd.to_numeric(frame[column], errors='coerce').astype('float')
case _:
continue


def from_lovd_to_pandas(path):
Expand Down

0 comments on commit 08e6f81

Please sign in to comment.