From 7495452ac8e0bae893699e87bd23edfedbedef36 Mon Sep 17 00:00:00 2001 From: Dainius Date: Wed, 24 Apr 2024 17:11:14 +0300 Subject: [PATCH] setting datetypes fix --- data_collection/__init__.py | 6 +-- data_collection/pipeline.py | 6 +-- data_collection/refactoring.py | 13 +++--- tests/.gitkeep | 0 tests/pipeline.ipynb | 82 ++++++++++++++++++++++++++++++++++ 5 files changed, 95 insertions(+), 12 deletions(-) delete mode 100644 tests/.gitkeep create mode 100644 tests/pipeline.ipynb diff --git a/data_collection/__init__.py b/data_collection/__init__.py index 0921646..5a07c03 100644 --- a/data_collection/__init__.py +++ b/data_collection/__init__.py @@ -52,7 +52,7 @@ # DATA REFACTORING IMPORT from .refactoring import ( # Functions for refactoring data - convert_lovd_to_datatype, - parse_lovd, - from_clinvar_name_to_cdna_position + set_lovd_dtypes, + parse_lovd, + from_clinvar_name_to_cdna_position ) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 4c5b402..41fc138 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -1,10 +1,10 @@ -""" Module executes general pipeline for data collection """ +""" Module executes general pipeline for data collection. FILE WILL BE DEPRECATED. """ import logging import pandas as pd from .collection import store_database_for_eys_gene -from .refactoring import parse_lovd, convert_lovd_to_datatype, from_clinvar_name_to_cdna_position +from .refactoring import parse_lovd, set_lovd_dtypes, from_clinvar_name_to_cdna_position from .constants import (DATA_PATH, LOVD_PATH, GNOMAD_PATH, @@ -67,7 +67,7 @@ def main(): gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv") clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t') - convert_lovd_to_datatype(lovd_data) + set_lovd_dtypes(lovd_data) # renaming databases' columns gnomad_data.columns += "(gnomad)" diff --git a/data_collection/refactoring.py b/data_collection/refactoring.py index 9329a62..fb647fd 100644 --- a/data_collection/refactoring.py +++ b/data_collection/refactoring.py @@ -8,7 +8,7 @@ from .constants import LOVD_TABLES_DATA_TYPES -def convert_lovd_to_datatype(df_dict): +def set_lovd_dtypes(df_dict): """ Convert data from LOVD format table to desired data format based on specified data types. @@ -16,24 +16,25 @@ def convert_lovd_to_datatype(df_dict): """ for table_name in df_dict: - frame: DataFrame = df_dict[table_name][0] + frame: DataFrame = df_dict[table_name] for column in frame.columns: if column not in LOVD_TABLES_DATA_TYPES[table_name]: raise ValueError(f"Column {column} is undefined in LOVD_TABLES_DATA_TYPES") match LOVD_TABLES_DATA_TYPES[table_name][column]: case "Date": - frame[column] = pd.to_datetime(frame[column]) + frame[column] = pd.to_datetime(frame[column], errors='coerce') case "Boolean": - frame[column] = (frame[column] != 0).astype('bool') + frame[column] = frame[column].map({"0": False, "1": True}) case "String": frame[column] = frame[column].astype('string') - case "Integer64": + case "Integer": frame[column] = pd.to_numeric(frame[column]).astype('Int64') case "Double": frame[column] = pd.to_numeric(frame[column]).astype('float') case _: - raise ValueError("Undefined data type") + raise ValueError(f"Undefined data type: " + f"{LOVD_TABLES_DATA_TYPES[table_name][column]}") def parse_lovd(path): diff --git a/tests/.gitkeep b/tests/.gitkeep deleted file mode 100644 index e69de29..0000000 diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb new file mode 100644 index 0000000..d2a2fcc --- /dev/null +++ b/tests/pipeline.ipynb @@ -0,0 +1,82 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "id": "initial_id", + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "from data_collection import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " LOVD_PATH,\n", + " set_lovd_dtypes)" + ] + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "store_database_for_eys_gene(\"lovd\", override=False)" + ], + "metadata": { + "collapsed": false + }, + "id": "f49f7691a27aa7b4", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "for i in data:\n", + " print(i)\n", + " display(data[i])" + ], + "metadata": { + "collapsed": false + }, + "id": "cf5c45c0f7b9de0f", + "execution_count": null + }, + { + "cell_type": "code", + "outputs": [], + "source": [ + "set_lovd_dtypes(data)\n", + "for i in data:\n", + " print(i)\n", + " display(data[i].info())" + ], + "metadata": { + "collapsed": false + }, + "id": "ef07740b2fa63e42", + "execution_count": null + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +}