Skip to content

Commit

Permalink
Merge pull request #17 from DKI/datetime_fix
Browse files Browse the repository at this point in the history
DKI/setting_datetypes
  • Loading branch information
Strexas authored Apr 24, 2024
2 parents e1c4c26 + 7495452 commit b627850
Show file tree
Hide file tree
Showing 5 changed files with 95 additions and 12 deletions.
6 changes: 3 additions & 3 deletions data_collection/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -52,7 +52,7 @@
# DATA REFACTORING IMPORT
from .refactoring import (
# Functions for refactoring data
convert_lovd_to_datatype,
parse_lovd,
from_clinvar_name_to_cdna_position
set_lovd_dtypes,
parse_lovd,
from_clinvar_name_to_cdna_position
)
6 changes: 3 additions & 3 deletions data_collection/pipeline.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
""" Module executes general pipeline for data collection """
""" Module executes general pipeline for data collection. FILE WILL BE DEPRECATED. """

import logging
import pandas as pd

from .collection import store_database_for_eys_gene
from .refactoring import parse_lovd, convert_lovd_to_datatype, from_clinvar_name_to_cdna_position
from .refactoring import parse_lovd, set_lovd_dtypes, from_clinvar_name_to_cdna_position
from .constants import (DATA_PATH,
LOVD_PATH,
GNOMAD_PATH,
Expand Down Expand Up @@ -67,7 +67,7 @@ def main():
gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

convert_lovd_to_datatype(lovd_data)
set_lovd_dtypes(lovd_data)

# renaming databases' columns
gnomad_data.columns += "(gnomad)"
Expand Down
13 changes: 7 additions & 6 deletions data_collection/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,32 +8,33 @@

from .constants import LOVD_TABLES_DATA_TYPES

def convert_lovd_to_datatype(df_dict):
def set_lovd_dtypes(df_dict):
"""
Convert data from LOVD format table to desired data format based on specified data types.
:param dict[str, DataFrame] df_dict: Dictionary mapping each LOVD table name to its DataFrame
"""

for table_name in df_dict:
frame: DataFrame = df_dict[table_name][0]
frame: DataFrame = df_dict[table_name]
for column in frame.columns:
if column not in LOVD_TABLES_DATA_TYPES[table_name]:
raise ValueError(f"Column {column} is undefined in LOVD_TABLES_DATA_TYPES")

match LOVD_TABLES_DATA_TYPES[table_name][column]:
case "Date":
frame[column] = pd.to_datetime(frame[column])
frame[column] = pd.to_datetime(frame[column], errors='coerce')
case "Boolean":
frame[column] = (frame[column] != 0).astype('bool')
frame[column] = frame[column].map({"0": False, "1": True})
case "String":
frame[column] = frame[column].astype('string')
case "Integer64":
case "Integer":
frame[column] = pd.to_numeric(frame[column]).astype('Int64')
case "Double":
frame[column] = pd.to_numeric(frame[column]).astype('float')
case _:
raise ValueError("Undefined data type")
raise ValueError(f"Undefined data type: "
f"{LOVD_TABLES_DATA_TYPES[table_name][column]}")


def parse_lovd(path):
Expand Down
Empty file removed tests/.gitkeep
Empty file.
82 changes: 82 additions & 0 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,82 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"id": "initial_id",
"metadata": {
"collapsed": true
},
"outputs": [],
"source": [
"from data_collection import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
" LOVD_PATH,\n",
" set_lovd_dtypes)"
]
},
{
"cell_type": "code",
"outputs": [],
"source": [
"store_database_for_eys_gene(\"lovd\", override=False)"
],
"metadata": {
"collapsed": false
},
"id": "f49f7691a27aa7b4",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
"for i in data:\n",
" print(i)\n",
" display(data[i])"
],
"metadata": {
"collapsed": false
},
"id": "cf5c45c0f7b9de0f",
"execution_count": null
},
{
"cell_type": "code",
"outputs": [],
"source": [
"set_lovd_dtypes(data)\n",
"for i in data:\n",
" print(i)\n",
" display(data[i].info())"
],
"metadata": {
"collapsed": false
},
"id": "ef07740b2fa63e42",
"execution_count": null
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.6"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

0 comments on commit b627850

Please sign in to comment.