Merge pull request #17 from DKI/datetime_fix

DKI/setting_datetypes
Strexas · Apr 24, 2024 · b627850 · b627850
2 parents e1c4c26 + 7495452
commit b627850
Show file tree

Hide file tree

Showing 5 changed files with 95 additions and 12 deletions.
diff --git a/data_collection/__init__.py b/data_collection/__init__.py
@@ -52,7 +52,7 @@
 # DATA REFACTORING IMPORT
 from .refactoring import (
   # Functions for refactoring data
-  convert_lovd_to_datatype,
-  parse_lovd,
-  from_clinvar_name_to_cdna_position
+    set_lovd_dtypes,
+    parse_lovd,
+    from_clinvar_name_to_cdna_position
 )
diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py
@@ -1,10 +1,10 @@
-""" Module executes general pipeline for data collection """
+""" Module executes general pipeline for data collection. FILE WILL BE DEPRECATED. """
 
 import logging
 import pandas as pd
 
 from .collection import store_database_for_eys_gene
-from .refactoring import parse_lovd, convert_lovd_to_datatype, from_clinvar_name_to_cdna_position
+from .refactoring import parse_lovd, set_lovd_dtypes, from_clinvar_name_to_cdna_position
 from .constants import (DATA_PATH,
                        LOVD_PATH,
                        GNOMAD_PATH,
@@ -67,7 +67,7 @@ def main():
     gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
     clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')
 
-    convert_lovd_to_datatype(lovd_data)
+    set_lovd_dtypes(lovd_data)
 
     # renaming databases' columns
     gnomad_data.columns += "(gnomad)"

diff --git a/data_collection/refactoring.py b/data_collection/refactoring.py
@@ -8,32 +8,33 @@
 
 from .constants import LOVD_TABLES_DATA_TYPES
 
-def convert_lovd_to_datatype(df_dict):
+def set_lovd_dtypes(df_dict):
     """
     Cast every column of each LOVD table, in place, to the data type listed for it in LOVD_TABLES_DATA_TYPES.
 
     :param dict[str, DataFrame] df_dict: Dictionary mapping each LOVD table name to its DataFrame
     """
 
     for table_name in df_dict:
-        frame: DataFrame = df_dict[table_name][0]
+        frame: DataFrame = df_dict[table_name]
         for column in frame.columns:
             if column not in LOVD_TABLES_DATA_TYPES[table_name]:
                 raise ValueError(f"Column {column} is undefined in LOVD_TABLES_DATA_TYPES")
 
             match LOVD_TABLES_DATA_TYPES[table_name][column]:
                 case "Date":
-                    frame[column] = pd.to_datetime(frame[column])
+                    frame[column] = pd.to_datetime(frame[column], errors='coerce')
                 case "Boolean":
-                    frame[column] = (frame[column] != 0).astype('bool')
+                    frame[column] = frame[column].map({"0": False, "1": True})
                 case "String":
                     frame[column] = frame[column].astype('string')
-                case "Integer64":
+                case "Integer":
                     frame[column] = pd.to_numeric(frame[column]).astype('Int64')
                 case "Double":
                     frame[column] = pd.to_numeric(frame[column]).astype('float')
                 case _:
-                    raise ValueError("Undefined data type")
+                    raise ValueError(f"Undefined data type: "
+                                     f"{LOVD_TABLES_DATA_TYPES[table_name][column]}")
 
 
 def parse_lovd(path):

diff --git a/tests/.gitkeep b/tests/.gitkeep
diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb
@@ -0,0 +1,82 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "initial_id",
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "from data_collection import (store_database_for_eys_gene,\n",
+    "                             parse_lovd,\n",
+    "                             LOVD_PATH,\n",
+    "                             set_lovd_dtypes)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "store_database_for_eys_gene(\"lovd\", override=False)"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "f49f7691a27aa7b4",
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
+    "for i in data:\n",
+    "    print(i)\n",
+    "    display(data[i])"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "cf5c45c0f7b9de0f",
+   "execution_count": null
+  },
+  {
+   "cell_type": "code",
+   "outputs": [],
+   "source": [
+    "set_lovd_dtypes(data)\n",
+    "for i in data:\n",
+    "    print(i)\n",
+    "    display(data[i].info())"
+   ],
+   "metadata": {
+    "collapsed": false
+   },
+   "id": "ef07740b2fa63e42",
+   "execution_count": null
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.6"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}