# %% [markdown]
# # Import

# %%
import math
from typing import Optional

import numpy as np
import pandas as pd

# %% [markdown]
# # Setup

# %%
# Output frame; every helper below appends exactly one column to it.
data = pd.DataFrame()

# %%
raw = pd.read_csv('1_raw.csv')
raw

# %% [markdown]
# # Logic
#
# Every helper cleans one column of `raw`, writes the cleaned column back into
# `raw` (later cells re-read already-cleaned columns, e.g. `urgency`,
# `previous_op`, `ageatsurgery`), prints a completeness summary and stores the
# result in `data`. Because of the write-back, the cells below must run in order.

# %%
def _summarise(column: pd.Series, detail: str) -> None:
    """Print completeness (and optional statistics) overall and per cohort.

    The cohort split uses `raw['center']`: 0 is reported as "Data",
    1 as "Validation".

    detail: 'numeric' -> mean/std/min/max, 'range' -> min/max, 'share' -> completeness only.
    """
    cohort = raw['center']
    subsets = (
        ('All', column),
        ('Data', column[cohort == 0]),
        ('Validation', column[cohort == 1]),
    )
    for label, subset in subsets:
        # Guard against an empty cohort instead of raising ZeroDivisionError.
        share = round(subset.count() / len(subset) * 100, 2) if len(subset) else float('nan')
        if detail == 'numeric':
            print(
                f'{label}: {share} % (Mean: {round(subset.mean(), 2)}, Std: {round(subset.std(), 2)}, '
                f'Min: {round(subset.min(), 2)}, Max: {round(subset.max(), 2)})'
            )
        elif detail == 'range':
            print(f'{label}: {share} % (Min: {subset.min()}, Max: {subset.max()})')
        else:
            print(f'{label}: {share} %')


def _announce(name: str, raw_name: str) -> None:
    """Print a header so the output of looped encoder calls can be told apart."""
    print(f'\n=== {name} <- {raw_name} ===')


# %%
def numeric(name: str, raw_name: str, min: float = np.nan, max: float = np.nan):
    """Coerce `raw[raw_name]` to float and store it as `data[name]`.

    Values that cannot be parsed become NaN (the distinct offenders are printed).
    Values below `min` / above `max` become NaN as well; a NaN bound disables
    that check. The cleaned column is written back to `raw[raw_name]`.

    Note: `min` / `max` shadow the builtins; the names are kept because callers
    pass them as keywords.
    """
    column = raw[raw_name]
    coerced = pd.to_numeric(column, errors='coerce')

    # Present in the raw column but lost by the coercion => not numeric.
    print('Non Numeric unique Values:', column[coerced.isna() & column.notna()].unique())

    if not np.isnan(min):
        too_small = coerced < min  # NaN compares False, so missing values stay untouched
        print('Values smaller than min:', too_small.sum())
        coerced = coerced.mask(too_small)
    if not np.isnan(max):
        too_large = coerced > max
        print('Values greater than max:', too_large.sum())
        coerced = coerced.mask(too_large)

    raw[raw_name] = coerced
    _summarise(coerced, 'numeric')

    data[name] = coerced.astype(float)


# %%
def categorial_numeric_encoding(name: str, raw_name: str, not_supported: Optional[list] = None):
    """Encode the distinct values of `raw[raw_name]` as 0..k-1 and store them as `data[name]`.

    Values listed in `not_supported` are treated as missing. Codes follow the
    sorted order of the remaining values (order of appearance when the values
    are not mutually comparable). The encoded column is written back to
    `raw[raw_name]`, so encoding an already encoded column is a no-op.
    """
    column = raw[raw_name]
    if not_supported:
        column = column.replace(not_supported, np.nan)

    levels = column.unique()
    levels = levels[~pd.isna(levels)]
    try:
        levels = sorted(levels)
    except TypeError:
        # Mixed, non-comparable values: keep order of appearance.
        pass

    # One mapping pass: an already assigned code can never be mistaken for a
    # raw value that is processed later (e.g. levels [-1, 0]).
    codes = {level: code for code, level in enumerate(levels)}
    encoded = column.map(codes)
    raw[raw_name] = encoded

    for level, code in codes.items():
        print(code, level, round((encoded == code).sum() / len(encoded) * 100, 2), '%')

    _summarise(encoded, 'numeric')

    data[name] = encoded.astype('Int64')


# %%
def categorial_one_hot_encoding(name: str, raw_name: str, mapping: Optional[dict] = None,
                                not_supported: Optional[list] = None):
    """Normalise the labels of `raw[raw_name]` and store them as strings in `data[name]`.

    `mapping` renames labels, `not_supported` labels are treated as missing.
    The cleaned column is written back to `raw[raw_name]`.

    NOTE(review): `astype(str)` turns missing values into the literal string
    'nan', so they count as present in later completeness checks. Confirm this
    is what the downstream one-hot step expects.
    """
    column = raw[raw_name]
    if mapping:
        column = column.replace(mapping)
    if not_supported:
        column = column.replace(not_supported, np.nan)
    raw[raw_name] = column

    levels = column.unique()
    levels = levels[~pd.isna(levels)]
    try:
        levels = sorted(levels)
    except TypeError:
        pass

    for level in levels:
        print(level, round((column == level).sum() / len(column) * 100, 2), '%')

    _summarise(column, 'share')

    data[name] = column.astype(str)


# %%
def timestamp(name: str, raw_name: str):
    """Parse `raw[raw_name]` as datetimes (unparseable -> NaT) and store it as `data[name]`."""
    parsed = pd.to_datetime(raw[raw_name], errors='coerce')
    raw[raw_name] = parsed

    _summarise(parsed, 'range')

    data[name] = parsed


# %% [markdown]
# # Populate

# %%
# Organ label -> organ-system code, shared by the identifier and the meta column.
ORGAN_SYSTEM_CODES = {'Esophagus': 0, 'Gastric': 1, 'Colorectal': 2, 'Small_intestine': 2, 'Liver': 3, 'Pancreas': 4}
UNSUPPORTED_ORGANS = ['Other_organ_system', 'Transplant']

# %%
# Identifier: Cohort
raw['center'] = raw['center'].replace({'CVK': 0, 'CCM': 0, 'CBF': 1})
categorial_numeric_encoding('identifier_cohort', raw_name='center')

# %%
# Identifier: Organ System
raw['organ_system'] = raw['organ'].replace(ORGAN_SYSTEM_CODES)
categorial_numeric_encoding('identifier_organ_system', raw_name='organ_system', not_supported=UNSUPPORTED_ORGANS)

# %% [markdown]
# ## Meta

# %%
# Case
numeric('meta_case', raw_name='case', min=0)

# %%
# Patient
numeric('meta_patient', raw_name='patient', min=0)

# %%
# Incision
timestamp('meta_incision', raw_name='op_schnitt')

# %%
# Suture
timestamp('meta_suture', raw_name='op_naht')

# %%
# Year
raw['meta_year'] = data['meta_incision'].dt.year
categorial_numeric_encoding('meta_year', raw_name='meta_year', not_supported=[2023])

# %%
# Meta: System
raw['meta_system'] = raw['organ'].replace(ORGAN_SYSTEM_CODES)
categorial_numeric_encoding('meta_system', raw_name='meta_system', not_supported=UNSUPPORTED_ORGANS)

# %%
# Meta: OPS
numeric('meta_ops', raw_name='is_majorop')

# %%
# Meta: Age (unbounded on purpose; the bounded feature `age` is derived later)
numeric('meta_age', raw_name='ageatsurgery')

# %%
# Meta: Urgency
categorial_numeric_encoding('meta_urgency', raw_name='urgency', not_supported=['keine'])

# %%
# Meta: Resurgery
categorial_numeric_encoding('meta_resurgery', raw_name='previous_op')

# %% [markdown]
# ## Feature

# %% [markdown]
# ### General

# %%
# Gender
raw['patient_sex'] = raw['patient_sex'].replace({'male': 0, 'female': 1})
categorial_numeric_encoding('gender', raw_name='patient_sex')

# %%
# Age
numeric('age', raw_name='ageatsurgery', min=18, max=120)

# %%
# Height
numeric('height', raw_name='height_in_cm', min=100, max=250)

# %%
# Weight
numeric('weight', raw_name='pre_op_weight_in_kg', min=25, max=300)

# %%
# BMI
numeric('bmi', raw_name='bmi', min=5, max=100)

# %%
# ASA
categorial_numeric_encoding('asa', raw_name='asa_classification')

# %%
# ECOG
categorial_numeric_encoding('ecog', raw_name='ecog')

# %%
# CCI
categorial_numeric_encoding('cci', raw_name='cci')

# %% [markdown]
# ### Condition

# %%
# (feature name, raw column) - order defines the column order in `data`.
CONDITION_COLUMNS = [
    ('myocardial_infarction', 'myocardial_infarction'),
    ('congestive_heart_failure', 'congestive_heart_failure'),
    ('peripheral_vascular_disease', 'peripheral_vascular_disease'),
    ('cerebrovascular_disease', 'cerebrovascular_disease'),
    ('dementia', 'dementia'),
    ('chronic_pulmonary_disease', 'chronic_pulmonary_disease'),
    ('rheumatic_disease', 'rheumathic_disease'),
    ('peptic_ulcer_disease', 'peptic_ulcer_disease'),
    ('liver_disease_mild', 'mild_liver_disease'),
    ('liver_disease_moderate_to_severe', 'moderate_or_severe_liver_disease'),
    ('diabetes_without_chronic_complications', 'diabetes_without_chronic_complications'),
    ('diabetes_with_chronic_complications', 'diabetes_with_chronic_complications'),
    ('hemiplegia_or_paraplegia', 'hemiplegia_or_paraplegia'),
    ('renal_disease', 'renal_disease'),
    ('malignancy', 'any_malignancy'),
    ('metastatic_solid_tumor', 'metastatic_solid_tumor'),
    ('aids', 'aids_hiv'),
    ('cardiac_arythmia', 'cardiac_arrythmias'),
    ('valvular_disease', 'valvular_disease'),
    ('pulmonary_circulatory_disorder', 'pulmonary_circulation_disorder'),
    ('arterial_hypertension', 'arterial_hypertension'),
    ('other_neurological_disorders', 'other_neurological_disorders'),
    ('hypothyroidism', 'hypothyroidism'),
    ('coagulopathy', 'coagulopathy'),
    ('obesity', 'obesity'),
    ('weight_loss', 'weight_loss'),
    ('fluid_and_electrolyte_disorders', 'fluid_and_electrolyte_disorders'),
    ('blood_loss_anemia', 'blood_loss_anemia'),
    ('deficiency_anemia', 'deficiency_anemia'),
    ('alcohol_abuse', 'alcohol_abuse'),
    ('drug_abuse', 'drug_abuse'),
    ('psychoses', 'psychoses'),
    ('depression', 'depression'),
    ('coronary_heart_disease', 'khk'),
    ('chronic_pancreatitis', 'chronic_pancreatitis'),
    ('ulcerative_colitis', 'colitis_ulcerosa'),
    ('crohns_disease', 'crohns_disease'),
]

# %%
for feature, source_column in CONDITION_COLUMNS:
    _announce(feature, source_column)
    categorial_numeric_encoding(feature, raw_name=source_column)

# %% [markdown]
# ### Surgery

# %%
# Primary System
categorial_one_hot_encoding(
    'primary_system',
    raw_name='organ',
    mapping={'Esophagus': 'esophagus', 'Gastric': 'stomach', 'Colorectal': 'intestine',
             'Small_intestine': 'intestine', 'Liver': 'liver', 'Pancreas': 'pancreas'},
    not_supported=['Other_organ_system', 'Transplant'],
)

# %%
# System flags
SYSTEM_FLAG_COLUMNS = [
    ('system_esophagus', 'surgery_system_esophagus'),
    ('system_stomach', 'surgery_system_stomach'),
    ('system_intestine', 'surgery_system_intestine'),
    ('system_liver', 'surgery_system_liver'),
    ('system_pancreas', 'surgery_system_pancreas'),
]
for feature, source_column in SYSTEM_FLAG_COLUMNS:
    _announce(feature, source_column)
    categorial_numeric_encoding(feature, raw_name=source_column)

# %%
# System Count
numeric('system_count', raw_name='surgery_system_count', min=1, max=5)

# %%
# Urgency (re-reads the column already encoded for `meta_urgency`)
categorial_numeric_encoding('urgency', raw_name='urgency')

# %%
# Resurgery (re-reads the column already encoded for `meta_resurgery`)
categorial_numeric_encoding('resurgery', raw_name='previous_op')

# %%
# Month
raw['surgery_month'] = data['meta_incision'].dt.month
categorial_numeric_encoding('month', raw_name='surgery_month')

# %%
# Weekday
raw['surgery_weekday'] = data['meta_incision'].dt.weekday
categorial_numeric_encoding('weekday', raw_name='surgery_weekday')

# %%
# Daytime
def _daytime_bucket(hour):
    """Map an hour to 0 (22:00-05:59), 1 (06:00-13:59) or 2 (14:00-21:59); anything else (NaN) passes through."""
    if 6 <= hour < 14:
        return 1
    if 14 <= hour < 22:
        return 2
    if 0 <= hour < 6 or 22 <= hour < 24:
        return 0
    return hour


raw['surgery_daytime'] = data['meta_incision'].dt.hour.apply(_daytime_bucket)
categorial_numeric_encoding('daytime', raw_name='surgery_daytime')

# %%
# Hour
raw['surgery_hour'] = data['meta_incision'].dt.hour
categorial_numeric_encoding('hour', raw_name='surgery_hour')

# %%
# Monthly weather: (feature name, raw column, min, max)
WEATHER_COLUMNS = [
    ('mean_monthly_temperature', 'mean_monthly_temperature[°c]', -50, 50),
    ('minimum_monthly_temperature', 'min_monthly_temperature[°c]', -50, 50),
    ('maximum_monthly_temperature', 'max_monthly_temperature[°c]', -50, 50),
    ('monthly_precipitation', 'mean_monthly_rainfall[l/m2]', 0, 500),
    ('monthly_sunshine_hours', 'sunshine[h]', 0, 744),
]
for feature, source_column, lower, upper in WEATHER_COLUMNS:
    _announce(feature, source_column)
    numeric(feature, raw_name=source_column, min=lower, max=upper)

# %% [markdown]
# ### Lab

# %%
# Hematocrit arrives both as a fraction and as a percentage; bring percentages to fractions.
# NOTE(review): this runs before the numeric coercion, so it assumes the column
# holds no strings (`x > 1` would raise TypeError) - confirm against the raw file.
raw['hematocrit'] = raw['hematocrit'].apply(lambda x: x / 100 if x > 1 else x)

# %%
# (feature name, raw column, min, max) - plausibility bounds; values outside become NaN.
LAB_COLUMNS = [
    ('sodium', 'sodium', 50, 200),
    ('potassium', 'potassium', 1, 10),
    ('albumin', 'albumin', 0, 100),
    ('bilirubin_direct', 'bilirubin_direct', 0, 50),
    ('bilirubin_indirect', 'bilirubin_indirect', 0, 50),
    ('bilirubin', 'bilirubin', 0, 50),
    ('urea', 'urea', 0, 400),
    ('tsh', 'tsh', 0, 50),
    ('alt', 'alt', 0, 100000),
    ('ast', 'ast', 0, 100000),
    ('ggt', 'ggt', 0, 100000),
    ('lipase', 'lipase', 0, 50000),
    ('crp', 'crp', 0, 1000),
    ('lactate', 'lactate', 0, 200),
    ('hba1c', 'hba1c', 0, 100),
    ('hemoglobin', 'hemoglobin', 0, 40),
    ('wbc', 'wbc', 0, 100),
    ('platelets', 'platelets', 0, 16000000),
    ('hematocrit', 'hematocrit', 0, 1),
    ('prothrombin_time', 'prothrombin_time', 0, 200),
    ('inr', 'inr', 0, 10),
    ('aptt', 'aptt', 0, 500),
    ('erythrocytes', 'erythrocytes', 0, 10),
    ('creatinine', 'creatinine', 0, 100),
    ('glucose', 'glucose', 0, 1000),
    ('ph', 'abg_ph', 6, 8),
    ('base_excess', 'base_excess', -50, 50),
    ('bicarbonate', 'abg_bicarbonate', 0, 50),
    ('alpha_fetoprotein', 'alpha_fetoprotein', 0, 10000),
    ('ca_19_9', 'ca_19_9', 0, 10000),
    ('ca_125', 'ca_125', 0, 10000),
    ('cea', 'cea', 0, 10000),
]

# %%
for feature, source_column, lower, upper in LAB_COLUMNS:
    _announce(feature, source_column)
    numeric(feature, raw_name=source_column, min=lower, max=upper)

# %% [markdown]
# ## Target

# %%
TARGET_COLUMNS = [
    ('target_30_day_mortality', '30_day_mortality'),
    ('target_90_day_mortality', '90_day_mortality'),
    ('target_deceased_after_discharge', 'deceased_after_discharge'),
    ('target_clavien_dindo_5', 'clavien_dindo_v'),
    ('target_acute_myocardial_infarction', 'acute_myocardial_infarction'),
    ('target_pulmonary_embolism', 'pulmonary_embolism'),
    ('target_septic_shock', 'septic_shock'),
    ('target_pneumonia', 'pneumonia'),
    ('target_liver_failure', 'liver_failure'),
    ('target_cardiogenic_shock', 'cardiogenic_shock'),
    ('target_acute_pancreatitis', 'acute_pancreatitis'),
    ('target_acute_respiratory_failure', 'acute_respiratory_failure'),
]

# %%
for feature, source_column in TARGET_COLUMNS:
    _announce(feature, source_column)
    categorial_numeric_encoding(feature, raw_name=source_column)

# %% [markdown]
# # Process

# %%
# Sort
data = data.sort_values(by=['meta_incision'])
data

# %%
# Duplicates (keeps the earliest incision per case because of the sort above)
data = data.drop_duplicates(subset='meta_case')
data

# %%
# Feature Count
print('Feature Count:', data.filter(regex='^(?!identifier_|meta_|target_)').columns.size)

# %% [markdown]
# ## Inclusion Criteria

# %%
# Year
data = data[data['meta_year'].notna()]
data

# %%
# System
data = data[data['meta_system'].notna()]
data

# %%
# OPS
data = data[data['meta_ops'] == 1]
data

# %%
# Urgency
# NOTE(review): the threshold is a position in the sorted urgency labels produced
# by categorial_numeric_encoding - confirm which labels codes >= 4 stand for.
data = data[data['meta_urgency'] >= 4]
data

# %%
# Age
data = data[data['meta_age'] >= 18]
data

# %%
print(len(data[data['identifier_cohort'] == 0]))
print(len(data[data['identifier_cohort'] == 1]))

# %% [markdown]
# ## Column Management

# %%
# Drop columns that carry no information in the development cohort (cohort 0).
center0 = data[data['identifier_cohort'] == 0]
constant_columns = [
    column for column in center0.columns
    if column != 'identifier_cohort' and center0[column].nunique() <= 1
]
for column in constant_columns:
    print(column)
data = data.drop(columns=constant_columns)

data

# %%
center0 = data[data['identifier_cohort'] == 0]

# Number of columns per completeness bucket (rounded to the nearest 10 %).
completeness = data.count() / len(data) * 100
completeness = completeness.apply(lambda x: round(x / 10) * 10)
completeness = completeness.value_counts().sort_index()
completeness.plot.bar(title='Columns per completeness bucket (%)');

# %%
# Drop columns that are more than half empty in the development cohort.
missing_share = center0.isna().mean()
sparse_columns = [column for column in center0.columns if missing_share[column] > 0.5]
for column in sparse_columns:
    print(column)
data = data.drop(columns=sparse_columns)

data

# %% [markdown]
# ## Exclusion Criteria

# %%
# 30 Day Mortality
data = data[data['target_30_day_mortality'].notna()]

print(len(data[data['identifier_cohort'] == 0]))
print(len(data[data['identifier_cohort'] == 1]))

data

# %%
# 90 Day Mortality
data = data[data['target_90_day_mortality'].notna()]

print(len(data[data['identifier_cohort'] == 0]))
print(len(data[data['identifier_cohort'] == 1]))

data

# %%
# Completeness: keep rows with at least 75 % of the columns filled
# (`thresh` is a count of non-NA values, so round the fraction up to an int).
data = data.dropna(thresh=math.ceil(len(data.columns) * 0.75))

print(len(data[data['identifier_cohort'] == 0]))
print(len(data[data['identifier_cohort'] == 1]))
"\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = data.loc[:, ~data.columns.str.startswith('meta_')]\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.to_csv('3_final.csv', index = False)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(len(data[data['identifier_cohort'] == 0]))\n", - "print(len(data[data['identifier_cohort'] == 1]))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "description: pd.DataFrame = pd.DataFrame()\n", - "\n", - "i = 0\n", - "for column in data.columns:\n", - " data_total = data[column]\n", - " data_training = data[data['identifier_cohort'] == 0][column]\n", - " data_validation = data[data['identifier_cohort'] == 1][column]\n", - "\n", - " description = description.append({\n", - " 'id': i,\n", - " 'name': column,\n", - " 'dimension': '',\n", - " 'lower_limit': '',\n", - " 'upper_limit': '',\n", - " 'type': data_total.dtype,\n", - " #'count': data_total.count(),\n", - " #'count (0)': data_training.count(),\n", - " #'count (1)': data_validation.count(),\n", - " #'completeness': round(data_total.count() / len(data_total) * 100, 2),\n", - " #'completeness (0)': round(data_training.count() / len(data_training) * 100, 2),\n", - " #'completeness (1)': round(data_validation.count() / len(data_validation) * 100, 2),\n", - " #'min': data_total.min(),\n", - " #'min (0)': data_training.min(),\n", - " #'min (1)': data_validation.min(),\n", - " #'max': data_total.max(),\n", - " #'max (0)': data_training.max(),\n", - " #'max (1)': data_validation.max(),\n", - " #'unique': data_total.nunique(),\n", - " #'unique (0)': data_training.nunique(),\n", - " 
#'unique (1)': data_validation.nunique(),\n", - " }, ignore_index=True)\n", - "\n", - " i += 1\n", - "\n", - "description.to_csv('4_description.csv', index=False)\n", - "description" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(description.to_markdown(index=False))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Feature Count\n", - "print('Feature Count:', data.filter(regex='^(?!identifier_|meta_|target_)').columns.size)\n", - "print(data.columns)" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3.11.5 64-bit ('anaconda3')", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 4, - "vscode": { - "interpreter": { - "hash": "5f19079be71ebf6901568ccbd4ed5155aa379cf24d838b537abf2f3af9c2c3a9" - } - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/0_pre/1_analyze.ipynb b/0_pre/1_analyze.ipynb deleted file mode 100644 index ccc1378..0000000 --- a/0_pre/1_analyze.ipynb +++ /dev/null @@ -1,243 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv('5_final.csv')\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - 
# %%
def evaluate_numeric(column: str):
    """Print mean and standard deviation of a numeric column.

    One line is printed for all rows, one for the training cohort
    (identifier_cohort == 0) and one for the validation cohort
    (identifier_cohort == 1). Reads the module-level DataFrame `data`.

    Args:
        column: name of the column in `data` to summarise.
    """
    subsets = (
        ('ALL', data[column]),
        ('TRAINING', data[data['identifier_cohort'] == 0][column]),
        ('VALIDATION', data[data['identifier_cohort'] == 1][column]),
    )
    for label, series in subsets:
        print(f"{label}: {series.mean()} (Mean); {series.std()} (SD)")

# %%
def evaluate_categorial(column: str):
    """Print availability and value distribution of a categorical column.

    For all rows, the training cohort (identifier_cohort == 0) and the
    validation cohort (identifier_cohort == 1) this prints the number of
    non-missing values, the non-missing share, and the absolute and
    normalised value counts. Reads the module-level DataFrame `data`.

    Args:
        column: name of the column in `data` to summarise.
    """
    subsets = (
        # Fixed: the "ALL" line used the hard-coded column 'asa' instead of
        # `column`, so every feature reported ASA's count and completeness.
        ('ALL', data[column]),
        ('TRAINING', data[data['identifier_cohort'] == 0][column]),
        ('VALIDATION', data[data['identifier_cohort'] == 1][column]),
    )
    for label, series in subsets:
        # The value labelled "(%)" is a fraction in [0, 1], not a percentage.
        print(f"{label}: {series.count()} (N); {series.count() / len(series)} (%)")
        print(series.value_counts())
        print(series.value_counts(normalize=True))

# %% [markdown]
# # Analysis

# %% [markdown]
# ## Collective

# %% [markdown]
# ### Overall Completeness

# %%
# Share of non-missing cells over all non-identifier columns.
pure_data = data.drop(columns=[col for col in data.columns if col.startswith('identifier_')]) # also drop additional targets?
pure_training_data = pure_data[data['identifier_cohort'] == 0]
pure_validation_data = pure_data[data['identifier_cohort'] == 1]

print(f'[All] {pure_data.count().sum()} (N); {pure_data.count().sum() / pure_data.size} (%)')
print(f'[Training] {pure_training_data.count().sum()} (N); {pure_training_data.count().sum() / pure_training_data.size} (%)')
print(f'[Validation] {pure_validation_data.count().sum()} (N); {pure_validation_data.count().sum() / pure_validation_data.size} (%)')

# %%
# Gender
evaluate_categorial('gender')

# %%
# Age
evaluate_numeric('age')

# %%
evaluate_numeric('height')

# %%
evaluate_numeric('weight')

# %%
# print the percentage of values present (not nan) in data['asa']
print(data['asa'].count() / len(data['asa']))

# %%
evaluate_categorial('asa')

# %%
evaluate_numeric('cci')

# %% [markdown]
# ## Subgroups
"metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/0_pre/README.md b/0_pre/README.md index 55decbe..0cfa202 100644 --- a/0_pre/README.md +++ b/0_pre/README.md @@ -1,131 +1,112 @@ # Preoperative Data -The preoperative data set contains static medical data at the time before the index operation. Data is available from two different and independent clinics so that results can be validated externally. The dataset contains 13,225 individual cases, each with a total of 79 features and 12 potential targets. +The preoperative dataset contains static medical data recorded before the index operation. Data is available from two independent clinics so that results can be validated externally. The dataset contains 13,225 individual cases, each with 10 meta tags, 79 features and 12 targets. 
## Data Structure -**Type Legend:** 0=numeric, 1=binary / numeric encoded categorial, 2=on-hot encoded categorial - -| id | name | dimension | lower_limit | upper_limit | type | -| :--- | :------------------------------------- | :-------- | :---------- | :------------ | :--- | -| 0 | identifier_cohort | - | 0 | 1 | 1 | -| 1 | identifier_organ_system | - | 0 | 4 | 1 | -| 2 | gender | - | 0 | 1 | 1 | -| 3 | age | yrs | 18 | 120 | 0 | -| 4 | height | cm | 100 | 250 | 0 | -| 5 | weight | kg | 25 | 300 | 0 | -| 6 | bmi | kg / m^2 | 5 | 100 | 0 | -| 7 | asa | - | 0 | 5 | 1 | -| 8 | ecog | - | 0 | 5 | 1 | -| 9 | cci | - | 0 | 24 | 1 | -| 10 | myocardial_infarction | - | 0 | 1 | 1 | -| 11 | congestive_heart_failure | - | 0 | 1 | 1 | -| 12 | peripheral_vascular_disease | - | 0 | 1 | 1 | -| 13 | cerebrovascular_disease | - | 0 | 1 | 1 | -| 14 | dementia | - | 0 | 1 | 1 | -| 15 | chronic_pulmonary_disease | - | 0 | 1 | 1 | -| 16 | rheumatic_disease | - | 0 | 1 | 1 | -| 17 | peptic_ulcer_disease | - | 0 | 1 | 1 | -| 18 | liver_disease_mild | - | 0 | 1 | 1 | -| 19 | liver_disease_moderate_to_severe | - | 0 | 1 | 1 | -| 20 | diabetes_without_chronic_complications | - | 0 | 1 | 1 | -| 21 | diabetes_with_chronic_complications | - | 0 | 1 | 1 | -| 22 | hemiplegia_or_paraplegia | - | 0 | 1 | 1 | -| 23 | renal_disease | - | 0 | 1 | 1 | -| 24 | malignancy | - | 0 | 1 | 1 | -| 25 | metastatic_solid_tumor | - | 0 | 1 | 1 | -| 26 | aids | - | 0 | 1 | 1 | -| 27 | cardiac_arythmia | - | 0 | 1 | 1 | -| 28 | valvular_disease | - | 0 | 1 | 1 | -| 29 | pulmonary_circulatory_disorder | - | 0 | 1 | 1 | -| 30 | arterial_hypertension | - | 0 | 1 | 1 | -| 31 | other_neurological_disorders | - | 0 | 1 | 1 | -| 32 | hypothyroidism | - | 0 | 1 | 1 | -| 33 | coagulopathy | - | 0 | 1 | 1 | -| 34 | obesity | - | 0 | 1 | 1 | -| 35 | weight_loss | - | 0 | 1 | 1 | -| 36 | fluid_and_electrolyte_disorders | - | 0 | 1 | 1 | -| 37 | blood_loss_anemia | - | 0 | 1 | 1 | -| 38 | deficiency_anemia | - | 0 | 1 | 1 | 
-| 39 | alcohol_abuse | - | 0 | 1 | 1 | -| 40 | drug_abuse | - | 0 | 1 | 1 | -| 41 | psychoses | - | 0 | 1 | 1 | -| 42 | depression | - | 0 | 1 | 1 | -| 43 | coronary_heart_disease | - | 0 | 1 | 1 | -| 44 | chronic_pancreatitis | - | 0 | 1 | 1 | -| 45 | ulcerative_colitis | - | 0 | 1 | 1 | -| 46 | crohns_disease | - | 0 | 1 | 1 | -| 47 | primary_system | - | - | - | 2 | -| 48 | system_esophagus | - | 0 | 1 | 1 | -| 49 | system_stomach | - | 0 | 1 | 1 | -| 50 | system_intestine | - | 0 | 1 | 1 | -| 51 | system_liver | - | 0 | 1 | 1 | -| 52 | system_pancreas | - | 0 | 1 | 1 | -| 53 | system_count | - | 1 | 5 | 0 | -| 54 | urgency | - | 0 | 5 | 1 | -| 55 | resurgery | - | 0 | 1 | 1 | -| 56 | month | - | 0 | 11 | 1 | -| 57 | weekday | - | 0 | 6 | 1 | -| 58 | daytime | - | 0 | 2 | 1 | -| 59 | hour | - | 0 | 23 | 1 | -| 60 | mean_monthly_temperature | °C | -50 | 50 | 0 | -| 61 | minimum_monthly_temperature | °C | -50 | 50 | 0 | -| 62 | maximum_monthly_temperature | °C | -50 | 50 | 0 | -| 63 | monthly_precipitation | l / m^2 | 0 | 500 | 0 | -| 64 | monthly_sunshine_hours | h | 0 | 744 | 0 | -| 65 | sodium | mmol / l | 50 | 200 | 0 | -| 66 | potassium | mmol / l | 1 | 10 | 0 | -| 67 | bilirubin | mg / dl | 0 | 50 | 0 | -| 68 | urea | mg / dl | 0 | 400 | 0 | -| 69 | ggt | U / l | 0 | 100,000 | 0 | -| 70 | lipase | U / l | 0 | 50,000 | 0 | -| 71 | crp | mg / l | 0 | 1,000 | 0 | -| 72 | hemoglobin | g / dl | 0 | 40 | 0 | -| 73 | wbc | / nl | 0 | 100 | 0 | -| 74 | platelets | / nl | 0 | 16,000,000 | 0 | -| 75 | hematocrit | l / l | 0 | 1 | 0 | -| 76 | inr | - | 0 | 10 | 0 | -| 77 | aptt | s | 0 | 500 | 0 | -| 78 | erythrocytes | / pl | 0 | 10 | 0 | -| 79 | creatinine | mg / dl | 0 | 100 | 0 | -| 80 | glucose | mg / dl | 0 | 1,000 | 0 | -| 81 | target_30_day_mortality | - | 0 | 1 | 1 | -| 82 | target_90_day_mortality | - | 0 | 1 | 1 | -| 83 | target_deceased_after_discharge | - | 0 | 1 | 1 | -| 84 | target_clavien_dindo_v | - | 0 | 1 | 1 | -| 85 | 
target_acute_myocardial_infarction | - | 0 | 1 | 1 | -| 86 | target_pulmonary_embolism | - | 0 | 1 | 1 | -| 87 | target_septic_shock | - | 0 | 1 | 1 | -| 98 | target_pneumonia | - | 0 | 1 | 1 | -| 89 | target_liver_failure | - | 0 | 1 | 1 | -| 90 | target_cardiogenic_shock | - | 0 | 1 | 1 | -| 91 | target_acute_pancreatitis | - | 0 | 1 | 1 | -| 92 | target_acute_respiratory_failure | - | 0 | 1 | 1 | - - -## Filtered Data Sets - -The following exclusion criteria were applied to the following data sets: - -### Elective - -Exclusion Criteria: -- **meta_urgency:** <= 3 -- **target_30_day_mortality:** not available -- **target_90_day_mortality:** not available -- **Completeness:** < 75 % - -Output: 7,711 cases - -### Emergency - -Exclusion Criteria: -- **meta_urgency:** >= 4 -- **target_30_day_mortality:** not available -- **target_90_day_mortality:** not available -- **Completeness:** < 50 % - -Output: 1,273 cases +| id | name | type | count | count (0) | count (1) | completeness | completeness (0) | completeness (1) | min | min (0) | min (1) | max | max (0) | max (1) | unique | unique (0) | unique (1) | +|-----:|:---------------------------------------|:---------------|--------:|------------:|------------:|---------------:|-------------------:|-------------------:|:--------------------|:--------------------|:--------------------|:--------------------|:--------------------|:--------------------|---------:|-------------:|-------------:| +| 0 | meta_cohort | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0.0 | 0.0 | 1.0 | 1.0 | 0.0 | 1.0 | 2 | 1 | 1 | +| 1 | meta_case | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 302070451.0 | 302070451.0 | 307936727.0 | 381008918.0 | 381008918.0 | 316147162.0 | 13225 | 9185 | 4040 | +| 2 | meta_patient | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 1000678.0 | 1002903.0 | 1000678.0 | 82731116.0 | 82731116.0 | 81041636.0 | 11243 | 8163 | 3123 | +| 3 | meta_incision | datetime64[ns] | 13225 | 9185 | 4040 | 100 | 100 | 100 | 
2014-01-02 08:38:00 | 2014-01-02 08:38:00 | 2014-01-13 08:30:00 | 2022-12-30 09:00:00 | 2022-12-29 08:57:00 | 2022-12-30 09:00:00 | 13052 | 9095 | 4033 | +| 4 | meta_suture | datetime64[ns] | 13225 | 9185 | 4040 | 100 | 100 | 100 | 2014-01-02 13:58:00 | 2014-01-02 13:58:00 | 2014-01-13 11:10:00 | 2022-12-30 11:51:00 | 2022-12-29 15:41:00 | 2022-12-30 11:51:00 | 13177 | 9155 | 4039 | +| 5 | meta_year | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 8 | 8 | 8 | 9 | 9 | 9 | +| 6 | meta_system | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 4 | 4 | 4 | 5 | 5 | 5 | +| 7 | meta_age | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 18.0 | 18.0 | 18.0 | 218.0 | 218.0 | 122.0 | 84 | 80 | 83 | +| 8 | meta_urgency | Int64 | 13048 | 9009 | 4039 | 98.66 | 98.08 | 99.98 | 0 | 0 | 0 | 5 | 5 | 5 | 6 | 6 | 6 | +| 9 | meta_resurgery | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 10 | gender | Int64 | 12573 | 8533 | 4040 | 95.07 | 92.9 | 100 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 11 | age | float64 | 13221 | 9183 | 4038 | 99.97 | 99.98 | 99.95 | 18.0 | 18.0 | 18.0 | 98.0 | 97.0 | 98.0 | 81 | 78 | 81 | +| 12 | height | float64 | 10469 | 7397 | 3072 | 79.16 | 80.53 | 76.04 | 117.0 | 131.0 | 117.0 | 250.0 | 250.0 | 204.0 | 82 | 78 | 59 | +| 13 | weight | float64 | 10496 | 7421 | 3075 | 79.36 | 80.79 | 76.11 | 30.0 | 30.0 | 30.0 | 230.0 | 230.0 | 216.0 | 231 | 216 | 121 | +| 14 | bmi | float64 | 10468 | 7399 | 3069 | 79.15 | 80.56 | 75.97 | 10.4 | 10.4 | 10.6 | 69.7 | 69.7 | 68.2 | 485 | 474 | 282 | +| 15 | asa | Int64 | 10785 | 7697 | 3088 | 81.55 | 83.8 | 76.44 | 0 | 0 | 0 | 4 | 4 | 4 | 5 | 5 | 5 | +| 16 | ecog | Int64 | 8533 | 5696 | 2837 | 64.52 | 62.01 | 70.22 | 0 | 0 | 0 | 2 | 2 | 2 | 3 | 3 | 3 | +| 17 | cci | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 17 | 17 | 16 | 18 | 18 | 17 | +| 18 | myocardial_infarction | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 
| 2 | 2 | 2 | +| 19 | congestive_heart_failure | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 20 | peripheral_vascular_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 21 | cerebrovascular_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 22 | dementia | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 23 | chronic_pulmonary_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 24 | rheumatic_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 25 | peptic_ulcer_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 26 | liver_disease_mild | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 27 | liver_disease_moderate_to_severe | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 28 | diabetes_without_chronic_complications | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 29 | diabetes_with_chronic_complications | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 30 | hemiplegia_or_paraplegia | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 31 | renal_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 32 | malignancy | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 33 | metastatic_solid_tumor | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 34 | aids | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 35 | cardiac_arythmia | 
Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 36 | valvular_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 37 | pulmonary_circulatory_disorder | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 38 | arterial_hypertension | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 39 | other_neurological_disorders | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 40 | hypothyroidism | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 41 | coagulopathy | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 42 | obesity | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 43 | weight_loss | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 44 | fluid_and_electrolyte_disorders | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 45 | blood_loss_anemia | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 46 | deficiency_anemia | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 47 | alcohol_abuse | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 48 | drug_abuse | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 49 | psychoses | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 50 | depression | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 51 | coronary_heart_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 52 | 
chronic_pancreatitis | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 53 | ulcerative_colitis | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 54 | crohns_disease | Int64 | 12181 | 9026 | 3155 | 92.11 | 98.27 | 78.09 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 55 | primary_system | object | 13225 | 9185 | 4040 | 100 | 100 | 100 | esophagus | esophagus | esophagus | stomach | stomach | stomach | 5 | 5 | 5 | +| 56 | system_esophagus | Int64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 57 | system_stomach | Int64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 58 | system_intestine | Int64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 59 | system_liver | Int64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 60 | system_pancreas | Int64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 61 | system_count | float64 | 10534 | 6645 | 3889 | 79.65 | 72.35 | 96.26 | 1.0 | 1.0 | 1.0 | 4.0 | 4.0 | 3.0 | 4 | 4 | 3 | +| 62 | urgency | Int64 | 13048 | 9009 | 4039 | 98.66 | 98.08 | 99.98 | 0 | 0 | 0 | 5 | 5 | 5 | 6 | 6 | 6 | +| 63 | resurgery | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 64 | month | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 11 | 11 | 11 | 12 | 12 | 12 | +| 65 | weekday | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 6 | 6 | 6 | 7 | 7 | 7 | +| 66 | daytime | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 2 | 2 | 2 | 3 | 3 | 3 | +| 67 | hour | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 23 | 23 | 23 | 24 | 24 | 24 | +| 68 | mean_monthly_temperature | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | -0.8 | -0.8 | -0.8 | 22.8 | 22.8 | 22.8 | 87 | 87 | 87 | +| 69 | 
minimum_monthly_temperature | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | -13.7 | -13.7 | -13.7 | 14.0 | 14.0 | 14.0 | 87 | 87 | 87 | +| 70 | maximum_monthly_temperature | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 6.7 | 6.7 | 6.7 | 38.5 | 38.5 | 38.5 | 94 | 94 | 94 | +| 71 | monthly_precipitation | float64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 1.3 | 1.3 | 1.3 | 193.4 | 193.4 | 193.4 | 97 | 97 | 97 | +| 72 | monthly_sunshine_hours | float64 | 12967 | 9007 | 3960 | 98.05 | 98.06 | 98.02 | 14.1 | 14.1 | 14.1 | 340.4 | 340.4 | 340.4 | 103 | 103 | 103 | +| 73 | sodium | float64 | 12742 | 8772 | 3970 | 96.35 | 95.5 | 98.27 | 102.0 | 102.0 | 120.0 | 169.0 | 169.0 | 158.0 | 43 | 41 | 37 | +| 74 | potassium | float64 | 13024 | 9052 | 3972 | 98.48 | 98.55 | 98.32 | 1.4 | 1.4 | 1.6 | 8.4 | 8.4 | 8.2 | 57 | 53 | 44 | +| 75 | bilirubin | float64 | 8224 | 7079 | 1145 | 62.19 | 77.07 | 28.34 | 0.0 | 0.0 | 0.0 | 27.2 | 27.2 | 26.1 | 522 | 503 | 139 | +| 76 | urea | float64 | 9503 | 6981 | 2522 | 71.86 | 76 | 62.43 | 3.0 | 3.0 | 4.0 | 335.0 | 252.0 | 335.0 | 201 | 184 | 161 | +| 77 | ggt | float64 | 9880 | 7157 | 2723 | 74.71 | 77.92 | 67.4 | 3.0 | 3.0 | 5.0 | 4679.0 | 4679.0 | 2948.0 | 856 | 786 | 428 | +| 78 | lipase | float64 | 8755 | 6645 | 2110 | 66.2 | 72.35 | 52.23 | 3.0 | 3.0 | 4.0 | 5790.0 | 4781.0 | 5790.0 | 376 | 348 | 196 | +| 79 | crp | float64 | 8715 | 6457 | 2258 | 65.9 | 70.3 | 55.89 | 0.3 | 0.3 | 0.3 | 661.4 | 661.4 | 658.6 | 1762 | 1337 | 1003 | +| 80 | hemoglobin | float64 | 13076 | 9096 | 3980 | 98.87 | 99.03 | 98.51 | 3.3 | 3.3 | 4.5 | 39.9 | 37.0 | 39.9 | 155 | 147 | 130 | +| 81 | wbc | float64 | 13029 | 9074 | 3955 | 98.52 | 98.79 | 97.9 | 0.09 | 0.11 | 0.09 | 73.18 | 73.18 | 61.08 | 1858 | 1588 | 1311 | +| 82 | platelets | float64 | 13030 | 9075 | 3955 | 98.53 | 98.8 | 97.9 | 3.8 | 3.8 | 6.4 | 1240.0 | 1119.0 | 1240.0 | 717 | 653 | 573 | +| 83 | hematocrit | float64 | 6593 | 5122 | 1471 | 49.85 | 55.76 | 36.41 | 0.102 | 0.107 | 0.102 | 
0.652 | 0.652 | 0.643 | 441 | 412 | 337 | +| 84 | inr | float64 | 12976 | 9032 | 3944 | 98.12 | 98.33 | 97.62 | 0.8 | 0.8 | 0.8 | 8.0 | 8.0 | 8.0 | 185 | 157 | 132 | +| 85 | aptt | float64 | 10916 | 7632 | 3284 | 82.54 | 83.09 | 81.29 | 21.0 | 21.0 | 21.0 | 240.0 | 240.0 | 240.0 | 443 | 396 | 314 | +| 86 | erythrocytes | float64 | 13031 | 9076 | 3955 | 98.53 | 98.81 | 97.9 | 0.6 | 0.6 | 1.1 | 7.4 | 7.4 | 7.2 | 61 | 57 | 54 | +| 87 | creatinine | float64 | 9834 | 8505 | 1329 | 74.36 | 92.6 | 32.9 | 0.12 | 0.12 | 0.25 | 16.5 | 16.5 | 11.87 | 418 | 392 | 202 | +| 88 | glucose | float64 | 7152 | 5269 | 1883 | 54.08 | 57.37 | 46.61 | 1.0 | 1.0 | 38.0 | 752.0 | 722.0 | 752.0 | 309 | 286 | 218 | +| 89 | target_30_day_mortality | Int64 | 10941 | 7541 | 3400 | 82.73 | 82.1 | 84.16 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 90 | target_90_day_mortality | Int64 | 9879 | 6760 | 3119 | 74.7 | 73.6 | 77.2 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 91 | target_deceased_after_discharge | Int64 | 12525 | 8758 | 3767 | 94.71 | 95.35 | 93.24 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 92 | target_clavien_dindo_5 | Int64 | 13225 | 9185 | 4040 | 100 | 100 | 100 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 93 | target_acute_myocardial_infarction | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 94 | target_pulmonary_embolism | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 95 | target_septic_shock | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 96 | target_pneumonia | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 97 | target_liver_failure | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 98 | target_cardiogenic_shock | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 99 | target_acute_pancreatitis | Int64 | 12245 | 9052 | 3193 | 
92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | +| 100 | target_acute_respiratory_failure | Int64 | 12245 | 9052 | 3193 | 92.59 | 98.55 | 79.03 | 0 | 0 | 0 | 1 | 1 | 1 | 2 | 2 | 2 | ## Additional Notes diff --git a/1_intra/logic.ipynb b/1_intra/logic.ipynb deleted file mode 100644 index f4dca67..0000000 --- a/1_intra/logic.ipynb +++ /dev/null @@ -1,583 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from tqdm.notebook import tqdm" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Main" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "surgery_data: pd.DataFrame = pd.DataFrame(columns=['case', 'timestamp', 'type', 'value'])" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Numeric" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vitals" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('6_1_raw/Studie 2023-01-OP-An„sthesie-Vitalparameter.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "sheetNames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - 
{ - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# rename columns.\n", - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'MesswerteDatum': 'timestamp',\n", - " 'MesswerteTyp': 'type',\n", - " 'Messwert': 'value',\n", - " 'MesswertEinheit': 'unit'\n", - "}, inplace=True)\n", - "\n", - "# drop unit column.\n", - "data.drop(columns=['unit'], inplace=True)\n", - "\n", - "# convert to datetime.\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y%m%d%H%M%S')\n", - "\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(data.groupby('type').count())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "\n", - "# loop through groups.\n", - "for name, group in tqdm(groups):\n", - "\n", - " new_name: str = name\n", - " if name == 'Blutdruck Diastolisch': new_name = 'bp_dia'\n", - " elif name == 'Blutdruck Systolisch': new_name = 'bp_sys'\n", - " elif name == 'PAT_ANAE_SEDLINE': new_name = 'sedline'\n", - " elif name == 'Puls': new_name = 'hr'\n", - " elif name == 'beat_mess_AMV': new_name = 'rmv'\n", - " elif name == 'beat_mess_FiO2': new_name = 'fio2'\n", - " elif name == 'beat_mess_Frequenz_AF': new_name = 'rr'\n", - " elif name == 'beat_mess_IntrPEEP': new_name = 'vent_peep'\n", - " elif name == 'beat_mess_Kapnometrie_etCO2': new_name = 'capno_et_co2'\n", - " elif name == 'beat_mess_Spitzendruck_Ppeak': new_name = 'vent_p_peak'\n", - " # elif name == 'beat_mess_exp_Des': new_name = 'exp_des'\n", - " elif name == 'beat_mess_exp_Lachgas': new_name = 'exp_no'\n", - " elif name == 'beat_mess_exp_Sevo': 
new_name = 'exp_sevo'\n", - " # elif name == 'beat_mess_pulmon_compl': new_name = 'pulmon_compl'\n", - " elif name == 'vital_AF': new_name = 'rr'\n", - " elif name == 'vital_HF': new_name = 'hr'\n", - " elif name == 'vital_SaO2': new_name = 'sao2'\n", - " elif name == 'vital_T_K': new_name = 'temp'\n", - " elif name == 'vital_T_K2': new_name = 'temp'\n", - " elif name == 'vital_ZVD': new_name = 'cvd'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - " surgery_data: pd.DataFrame = pd.concat([surgery_data, group], ignore_index=True)\n", - "\n", - "surgery_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GCS" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('6_1_raw/Studie-2023-11-OP-An„sthesie-GCS.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "sheetNames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# rename columns.\n", - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'Datum': 'timestamp',\n", - " 'Score': 'value',\n", - "}, inplace=True)\n", - "\n", - "# add type columns.\n", - "data['type'] = 'gcs'\n", - "\n", - "# convert to datetime.\n", - "data['timestamp'] = 
pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')\n", - "\n", - "data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# surgery_data: pd.DataFrame = pd.concat([surgery_data, data], ignore_index=True)\n", - "# surgery_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Arterial Blood Gas" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('6_1_raw/Studie-2023-13-OP-An„sthesie-BGA.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "sheetNames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# rename columns.\n", - "data.rename(columns={\n", - " 'FALLNR': 'case',\n", - " 'LaborName': 'type',\n", - " 'Zeitpunkt': 'timestamp',\n", - " 'Wert': 'value',\n", - " 'Unit': 'unit'\n", - "}, inplace=True)\n", - "\n", - "# drop unit column.\n", - "data.drop(columns=['unit'], inplace=True)\n", - "\n", - "# convert to datetime.\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')\n", - "\n", - "data" - ] - }, - 
{ - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "len(data.groupby('type').count())" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "\n", - "# loop through groups.\n", - "for name, group in tqdm(groups):\n", - "\n", - " new_name: str = name\n", - " if name == 'ABE': new_name = 'abg_abe'\n", - " elif name == 'COHb': new_name = 'abg_cohb'\n", - " # elif name == 'Ca(7.4)': new_name = 'abg_ca_7_4'\n", - " elif name == 'Ca++': new_name = 'abg_ca'\n", - " elif name == 'Cl-': new_name = 'abg_cl'\n", - " elif name == 'FIO2': new_name = 'abg_fio2'\n", - " elif name == 'Glu': new_name = 'abg_glu'\n", - " elif name == 'Hct': new_name = 'abg_hct'\n", - " elif name == 'K+': new_name = 'abg_k'\n", - " elif name == 'Lac': new_name = 'abg_lac'\n", - " elif name == 'MetHb': new_name = 'abg_methb'\n", - " elif name == 'Na+': new_name = 'abg_na'\n", - " elif name == 'O2Hb': new_name = 'abg_o2hb'\n", - " elif name == 'SBE': new_name = 'abg_sbe'\n", - " elif name == 'T': new_name = 'abg_t'\n", - " elif name == 'pCO2': new_name = 'abg_pco2'\n", - " # elif name == 'pCO2(T)': new_name = 'abg_pco2_t'\n", - " elif name == 'pH': new_name = 'abg_ph'\n", - " # elif name == 'pH(T)': new_name = 'abg_ph_t'\n", - " elif name == 'pO2': new_name = 'abg_po2'\n", - " # elif name == 'pO2(T)': new_name = 'abg_po2_t'\n", - " elif name == 'sO2': new_name = 'abg_so2'\n", - " elif name == 'tHb': new_name = 'abg_thb'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - " surgery_data: pd.DataFrame = pd.concat([surgery_data, group], ignore_index=True)\n", - "\n", - "surgery_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DDS" - ] - }, - { - "attachments": {}, - "cell_type": 
"markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('6_1_raw/Studie-2023-12-OP-An„sthesie-Deliriun Detection Score.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "sheetNames" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "surgery_data['value'] = surgery_data['value'].apply(pd.to_numeric, errors='coerce')\n", - "surgery_data['value'] = surgery_data['value'].astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# case_id\n", - "surgery_data['case'] = surgery_data['case'].astype(int)\n", - "\n", - "# event_ts\n", - "surgery_data['timestamp'] = pd.to_datetime(surgery_data['timestamp'])\n", - "\n", - "#type\n", - "surgery_data['type'] = surgery_data['type'].astype(str)\n", - "\n", - "# drop nan\n", - "surgery_data.dropna(inplace=True)\n", - "\n", - "# drop duplicates\n", - "surgery_data.drop_duplicates(keep = 'first', inplace = True)\n", - "\n", - "# sort by event_ts\n", - "surgery_data.sort_values(by=['timestamp'], inplace=True)\n", - "\n", - "surgery_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import \n", - "included_cases = pd.read_csv('../4_cases/4_3_clean.csv')\n", 
- "\n", - "# drop all masterData rows that are not in included_cases and print the number of deleted rows\n", - "print('Length of masterData before: ' + str(len(surgery_data)))\n", - "print('Number of cases before: ' + str(surgery_data['case'].nunique()))\n", - "surgery_data = surgery_data[surgery_data['case'].isin(included_cases['case'])]\n", - "print('Length of masterData after: ' + str(len(surgery_data)))\n", - "print('Number of cases after: ' + str(surgery_data['case'].nunique()))\n", - "\n", - "surgery_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "surgery_data.to_csv('6_3_clean.csv', index=False)\n", - "surgery_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "types = surgery_data.groupby('type').size()\n", - "types.to_csv(f'6_4_types.csv', index=False)\n", - "types" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/2_post/logic.ipynb b/2_post/logic.ipynb deleted file mode 100644 index 1cc6dec..0000000 --- a/2_post/logic.ipynb +++ /dev/null @@ -1,1379 +0,0 @@ -{ - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "from tqdm.notebook import tqdm" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": 
{}, - "source": [ - "# Logic" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "stays: list[pd.DataFrame] = []\n", - "icu_data: pd.DataFrame = pd.DataFrame(columns=['case', 'timestamp', 'type', 'value'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def load(file_name: str, path: str = '7_1_raw_copra_5/', columns: list[str] = ['case', 'ward', 'admission_ts', 'discharge_ts', 'type', 'parameter', 'value', 'timestamp']) -> pd.DataFrame:\n", - " # load the data from the txt file.\n", - " data = pd.read_csv(path + file_name + '.txt', encoding = 'ISO-8859-1', sep = '|', header = None)\n", - "\n", - " # remove the first column since it does not contain any useful information.\n", - " data.drop(columns = data.columns[0], axis=1, inplace=True)\n", - "\n", - " # rename the columns to the given columns.\n", - " data.columns = columns\n", - "\n", - " # extract the stays from the data.\n", - " new_stays = data[['case', 'ward', 'admission_ts', 'discharge_ts']]\n", - " data.drop(['ward', 'admission_ts', 'discharge_ts'], axis = 1, inplace = True)\n", - " stays.append(new_stays)\n", - "\n", - " # return the data.\n", - " return data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Copra 5" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Numeric" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### arterial_oxygen_saturation" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('arterial_oxygen_saturation')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['type', 'parameter'], axis=1, inplace=True)\n", - "data['type'] = 'sao2'\n", - "data" - ] - }, - { - 
"cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### blood_gas_analysis" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = ['case', 'ward', 'admission_ts', 'discharge_ts', 'type', 'parameter', 'timestamp', 'value', 'value_2']\n", - "data = load('blood_gas_analysis', columns = columns)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['type', 'value_2'], axis = 1, inplace = True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('parameter')\n", - "for name, group in groups:\n", - " group.drop(['parameter'], axis = 1, inplace = True)\n", - "\n", - " new_name: str = 'abg_'\n", - " if name == 'Calcium_ion': new_name += 'ca'\n", - " elif name == 'Glucose': new_name += 'glu'\n", - " elif name == 'HCO3': new_name += 'hco3'\n", - " elif name == 'Hb': new_name += 'thb'\n", - " elif name == 'Hkt': new_name += 'hct'\n", - " elif name == 'Kalium': new_name += 'k'\n", - " elif name == 'Laktat': new_name += 'lac'\n", - " elif name == 'Natrium': new_name += 'na'\n", - " elif name == 'SBE': new_name += 'sbe'\n", - " elif name == 'pCO2': new_name += 'pco2'\n", - " elif name == 'pH': new_name += 'ph'\n", - " elif name == 'pO2': new_name += 'po2'\n", - " elif name == 'sO2': new_name += 'so2'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - "\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### blood_pressure" - ] - }, - { 
- "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('blood_pressure')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['type'], axis = 1, inplace = True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('parameter')\n", - "for name, group in groups:\n", - " group.drop(['parameter'], axis = 1, inplace = True)\n", - "\n", - " new_name: str = 'bp_'\n", - " if name == 'RRs': new_name += 'sys'\n", - " elif name == 'RRd': new_name += 'dia'\n", - " else: continue\n", - " \n", - " group['type'] = new_name\n", - "\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### central_venous_pressure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('central_venous_pressure')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['type', 'parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'cvp'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### drainage" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('drainage')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, 
inplace = True)\n", - "data['type'] = 'drainage'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### fluid_intake" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('fluid_intake')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'fluid_in'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### fluid_output" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('fluid_output')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'fluid_out'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### heart_rate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('heart_rate')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": 
[], - "source": [ - "data.drop(['type', 'parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'hr'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### invasive_blood_pressure" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('invasive_blood_pressure')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, inplace = True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "for name, group in groups:\n", - " new_name = 'bp_'\n", - " if name == 'P_sa_i': new_name += 'sys'\n", - " elif name == 'P_da_i': new_name += 'dia'\n", - " elif name == 'P_ma_i': new_name = 'map'\n", - " else: continue\n", - " \n", - " group['type'] = new_name\n", - "\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### pulmonary_artery_pressure\n" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('pulmonary_artery_pressure')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'pap'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], 
ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### respiratory_rate" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('respiratory_rate')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['type', 'parameter'], axis = 1, inplace = True)\n", - "data['type'] = 'rr'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### score" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = ['case', 'ward', 'admission_ts', 'discharge_ts', 'type', 'parameter', 'value', 'value_2', 'value_3', 'value_4', 'timestamp']\n", - "data = load('score', columns=columns)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter', 'value_2', 'value_3', 'value_4'], axis=1, inplace=True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "for name, group in groups:\n", - " new_name: str = ''\n", - " if name == 'DDS': new_name = 'dds'\n", - " elif name == 'Glasgow_Coma_Scale': new_name = 'gcs'\n", - " elif name == 'SAPS2': new_name = 'saps2'\n", - " elif name == 'SOFA': new_name = 'sofa'\n", - " elif name == 'TISS-28': new_name = 'tiss28'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - "\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, 
- { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### temperature" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = load('temperature')\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(['parameter'], axis = 1, inplace = True)\n", - "data['type'] = 't'\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Natural Language" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('abdomen')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('care_report')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('central_nervous_system')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('handover')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('heart')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('kidney')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('medical_report')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('order')" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('skin')" - ] - 
}, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('stool')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Events" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "load('activity')" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Other" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = ['case', 'ward', 'admission_ts', 'discharge_ts', 'type', 'parameter', 'value', 'value_2', 'value_3', 'timestamp']\n", - "load('dialysis', columns=columns)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "columns = ['case', 'ward', 'admission_ts', 'discharge_ts', 'type', 'parameter', 'value', 'value_2', 'value_3', 'value_4', 'timestamp']\n", - "load('ventilation_dialysis_bar', columns=columns)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Stays" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# concat all stays to a dataframe.\n", - "stays = pd.concat(stays, axis = 0)\n", - "\n", - "# remove all rows where at least one value is nan.\n", - "stays.dropna(inplace = True)\n", - "\n", - "# remove all rows where at least one value is '\\N'.\n", - "for column in stays.columns:\n", - " stays = stays[stays[column] != r'\\N']\n", - "\n", - "# convert columns to appropriate types.\n", - "stays['case'] = stays['case'].astype(int)\n", - "stays['ward'] = stays['ward'].astype(str)\n", - "stays['admission_ts'] = pd.to_datetime(stays['admission_ts'])\n", - "stays['discharge_ts'] = pd.to_datetime(stays['discharge_ts'])\n", - "\n", - "# drop all rows where case_id is 0.\n", - "stays = stays[stays['case'] != 0]\n", - 
"\n", - "# drop duplicates.\n", - "stays.drop_duplicates(inplace = True)\n", - "\n", - "# sort by 'admission_ts'.\n", - "stays.sort_values(by = 'admission_ts', inplace = True)\n", - "\n", - "# save.\n", - "stays.to_csv(f'7_4_stays.csv', index = False)\n", - "stays" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Copra 6" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Vitals" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('7_2_raw_copra_6/Studie-2023-02-Vitalparameter.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "\n", - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))\n", - "\n", - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(columns=['MesswertEinheit'], inplace=True)\n", - "\n", - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'MesswerteDatum': 'timestamp',\n", - " 'MesswerteTyp': 'type',\n", - " 'Messwert': 'value'\n", - "}, inplace=True)\n", - "\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y%m%d%H%M%S')\n", - "\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "\n", - "for name, group in tqdm(groups):\n", - " new_name: str = name\n", - "\n", - " if name == 'Blutdruck Diastolisch': new_name = 'bp_dia'\n", - " elif name == 'Blutdruck Systolisch': new_name = 'bp_sys'\n", 
- " elif name == 'Puls': new_name = 'hr'\n", - " elif name == 'vital_AF': new_name = 'rr'\n", - " elif name == 'vital_HF': new_name = 'hr'\n", - " elif name == 'vital_SaO2': new_name = 'sao2'\n", - " elif name == 'vital_SaO2_2': new_name = 'sao2'\n", - " elif name == 'vital_T_K': new_name = 't'\n", - " elif name == 'vital_T_K2': new_name = 't'\n", - " elif name == 'vital_ZVD': new_name = 'cvp'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## ABG" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('7_2_raw_copra_6/Studie-2023-13-BGAs.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "\n", - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))\n", - "\n", - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.drop(columns=['Einheit'], inplace=True)\n", - "\n", - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'Zeitpunkt': 'timestamp',\n", - " 'Parameter': 'type',\n", - " 'Wert': 'value'\n", - "}, inplace=True)\n", - "\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')\n", - "\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "groups = data.groupby('type')\n", - "\n", - "for name, 
group in tqdm(groups):\n", - " new_name: str = name\n", - "\n", - " new_name: str = 'abg_'\n", - " if name == 'ABE': new_name += 'abe'\n", - " elif name == 'COHb': new_name += 'cohb'\n", - " elif name == 'Ca++': new_name += 'ca'\n", - " elif name == 'Cl-': new_name += 'cl'\n", - " elif name == 'FIO2': new_name += 'fio2'\n", - " elif name == 'Glu': new_name += 'glu'\n", - " elif name == 'HCO3': new_name += 'hco3'\n", - " elif name == 'Hct': new_name += 'hct'\n", - " elif name == 'K+': new_name += 'k'\n", - " elif name == 'Lac': new_name += 'lac'\n", - " elif name == 'MetHb': new_name += 'methb'\n", - " elif name == 'Na+': new_name += 'na'\n", - " elif name == 'O2Hb': new_name += 'o2hb'\n", - " elif name == 'RHb': new_name += 'rhb' # reduced hemoglobin\n", - " elif name == 'SBE': new_name += 'sbe'\n", - " elif name == 'T': new_name += 't'\n", - " elif name == 'pCO2': new_name += 'pco2'\n", - " elif name == 'pH': new_name += 'ph'\n", - " elif name == 'pO2': new_name += 'po2'\n", - " elif name == 'sO2': new_name += 'so2'\n", - " elif name == 'tHb': new_name += 'thb'\n", - " else: continue\n", - "\n", - " group['type'] = new_name\n", - " icu_data: pd.DataFrame = pd.concat([icu_data, group], ignore_index=True)\n", - "\n", - "icu_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## GCS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('7_2_raw_copra_6/Studie-2023-11-GCS.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "\n", - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))\n", - "\n", - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", 
- "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'Datum': 'timestamp',\n", - " 'Score': 'value'\n", - "}, inplace=True)\n", - "\n", - "data['type'] = 'gcs'\n", - "\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')\n", - "\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## DDS" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('7_2_raw_copra_6/Studie-2023-12-Deliriun Detection Score.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "\n", - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))\n", - "\n", - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.rename(columns={\n", - " 'Fallnummer': 'case',\n", - " 'Datum': 'timestamp',\n", - " 'DDS': 'value'\n", - "}, inplace=True)\n", - "\n", - "data['type'] = 'dds'\n", - "\n", - "data['timestamp'] = pd.to_datetime(data['timestamp'], format='%Y-%m-%d %H:%M:%S.%f')\n", - "\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - 
"icu_data: pd.DataFrame = pd.concat([icu_data, data], ignore_index=True)\n", - "icu_data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## TODO: SAPSII & TISS10" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "file: pd.ExcelFile = pd.ExcelFile('7_2_raw_copra_6/Studie-2023-07-Scores-TISS-SAPSII.xlsx')\n", - "sheetNames: list[str] = file.sheet_names\n", - "\n", - "sheets: list[pd.DataFrame] = []\n", - "for name in tqdm(sheetNames):\n", - " sheets.append(pd.read_excel(file, name))\n", - "\n", - "data: pd.DataFrame = pd.concat(sheets, ignore_index=True)\n", - "data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data.columns" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data['SAPS2_VKrank']" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data['value'] = icu_data['value'].apply(pd.to_numeric, errors='coerce')\n", - "icu_data['value'] = icu_data['value'].astype(float)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data['case'] = icu_data['case'].apply(pd.to_numeric, 
errors='coerce')\n", - "\n", - "# drop nan\n", - "icu_data.dropna(inplace=True)\n", - "\n", - "# convert\n", - "icu_data['case'] = icu_data['case'].astype(int)\n", - "icu_data['timestamp'] = pd.to_datetime(icu_data['timestamp'])\n", - "icu_data['type'] = icu_data['type'].astype(str)\n", - "\n", - "# drop all rows where case is 0.\n", - "icu_data = icu_data[icu_data['case'] != 0]\n", - "\n", - "# drop duplicates\n", - "icu_data.drop_duplicates(keep = 'first', inplace = True)\n", - "\n", - "# sort by timestamp\n", - "icu_data.sort_values(by=['timestamp'], inplace=True)\n", - "\n", - "icu_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# import \n", - "included_cases = pd.read_csv('../4_cases/4_3_clean.csv')\n", - "\n", - "# drop all icu_data rows whose case is not in included_cases and print the row and case counts before and after\n", - "print('Length of masterData before: ' + str(len(icu_data)))\n", - "print('Number of cases before: ' + str(icu_data['case'].nunique()))\n", - "icu_data = icu_data[icu_data['case'].isin(included_cases['case'])]\n", - "print('Length of masterData after: ' + str(len(icu_data)))\n", - "print('Number of cases after: ' + str(icu_data['case'].nunique()))\n", - "\n", - "icu_data" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Save" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "icu_data.to_csv('7_5_clean.csv', index=False)\n", - "icu_data" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "types = icu_data.groupby('type').count()\n", - "types.to_csv(f'7_6_types.csv')\n", - "types" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "base", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension":
".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.7" - }, - "orig_nbformat": 4 - }, - "nbformat": 4, - "nbformat_minor": 2 -} diff --git a/3_lab/logic.ipynb b/3_lab/logic.ipynb deleted file mode 100644 index 62fa1cd..0000000 --- a/3_lab/logic.ipynb +++ /dev/null @@ -1,1247 +0,0 @@ -{ - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Import" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "import pandas as pd\n", - "pd.options.mode.chained_assignment = None" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Load" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data = pd.read_csv('8_1_raw.csv', sep=';', low_memory=False)\n", - "data.drop(['n2labor_c_recordid', 'labor001_c_n2leistid'], axis=1, inplace=True)\n", - "data.rename(columns={'labor001_c_n2kattext': 'type', 'c_n2value': 'value', 'c_n2unit': 'unit', 'lab_datetime': 'timestamp', 'Fallnummer': 'case'}, inplace=True)\n", - "data = data[['case', 'timestamp', 'type', 'value', 'unit']]\n", - "data" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Check" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "data[data['type'] == '']" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Process" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "def append(name: str, data: pd.DataFrame, lab_data: pd.DataFrame, types: [str], units: [str] = []) -> pd.DataFrame:\n", - " subsets = []\n", - " for type in types:\n", - " type_data = data[data['type'] == type]\n", - " print(f'{type}: {len(type_data)} {type_data[\"unit\"].unique()}')\n", - " subsets.append(type_data)\n", - " 
\n", - " subset = pd.concat(subsets, ignore_index=True)\n", - " if len(units) > 0: subset = subset[subset['unit'].isin(units)]\n", - "\n", - " subset['type'] = name\n", - " print(f'Total: {len(subset)} {subset[\"unit\"].unique()}')\n", - "\n", - " return pd.concat([lab_data, subset], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "lab_data = pd.DataFrame(columns=['case', 'timestamp', 'type', 'value'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# 25 OH Vitamin D3 = 25 OH Vitamin D3\n", - "lab_data = append('vd25', data, lab_data, ['25-Hydroxy-Vitamin D3', '25-OH-Vitamin D3', '25-OH-Vitamin D3 Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Base Excess = Basenüberschuss\n", - "lab_data = append('be', data, lab_data, ['ABE', 'Base Excess', 'Basenüberschuß', 'SBE'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Antithrombin = Antithrombin\n", - "lab_data = append('at', data, lab_data, ['AT3', 'Antithrombin', 'Antithrombin Aktivität'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Albumin = Albumin\n", - "lab_data = append('alb', data, lab_data, ['Albumin', 'Albumin (HP)', 'Albumin HP', 'Albumin Se', 'Albumin i.Se'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Alkaline Phosphatase = Alkalische Phosphatase\n", - "lab_data = append('alp', data, lab_data, ['Alk. Phosphatase', 'Alk.
Phosphatase (HP)', 'Alk.Phosphatase', 'Alk.Phosphatase HP', 'Alk.Phosphatase Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Ammonia = Ammoniak\n", - "lab_data = append('nh3', data, lab_data, ['Ammoniak', 'Ammoniak (EDTA)', 'Ammoniak EDTA'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Amylase = Amylase\n", - "lab_data = append('ams', data, lab_data, ['Amylase', 'Amylase HP', 'Amylase Se'], ['U/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Basophils Absolute = Basophile Absolut\n", - "lab_data = append('baso', data, lab_data, ['Basophile absolut'])\n", - "\n", - "baso = data[(data['type'] == 'Basophile') & (data['unit'] == '/nl')]\n", - "baso['type'] = 'baso'\n", - "lab_data = pd.concat([lab_data, baso], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Basophils Relative = Basophile Relativ\n", - "lab_data = append('baso_rel', data, lab_data, ['Basophile %'])\n", - "\n", - "baso_rel = data[(data['type'] == 'Basophile') & (data['unit'] == '%')]\n", - "baso_rel['type'] = 'baso_rel'\n", - "lab_data = pd.concat([lab_data, baso_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Total Bilirubin = Gesamt Bilirubin\n", - "lab_data = append('tbil', data, lab_data, ['Bilirubin', 'Bilirubin gesamt Se', 'Bilirubin, gesamt', 'Bilirubin, gesamt HP', 'Bilirubin, total', 'Bilirubin, total (HP)', 'tBil'], ['mg/dl', 'mg/dL'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Bilirubin Direct = Bilirubin Direkt\n", - "lab_data = append('dbil', data, lab_data, ['Bilirubin direkt Se', 'Bilirubin, conjugiert', 'Bilirubin, 
direkt', 'Bilirubin, direkt (HP)', 'Bilirubin, direkt HP'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Bilirubin Indirect = Bilirubin Indirekt\n", - "lab_data = append('ibil', data, lab_data, ['Bilirubin indirekt'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Potential of Hydrogen (pH) = Potenzial von Wasserstoff\n", - "lab_data = append('ph', data, lab_data, ['Blut-pH-Wert', 'pH', 'pH-Wert'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creatine Kinase = Kreatin Kinase\n", - "lab_data = append('ck', data, lab_data, ['CK', 'CK (HP)', 'Creatinkinase (CK)', 'Creatinkinase (CK) HP', 'Creatinkinase (CK) Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creatine Kinase MB = Kreatin Kinase MB\n", - "lab_data = append('ck_mb', data, lab_data, ['CK-MB', 'CK-MB (HP)', 'CK-MB HP', 'CK-MB Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Carboxyhemoglobin = Carboxyhämoglobin\n", - "lab_data = append('cohb', data, lab_data, ['CO-Hb', 'COHb'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# C Reactive Protein = C Reaktives Protein\n", - "lab_data = append('crp', data, lab_data, ['CRP', 'CRP (HP)', 'CRP HP', 'CRP Se'], ['mg/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Calcium = Kalzium\n", - "lab_data = append('ca', data, lab_data, ['Ca++', 'Calcium', 'Calcium (HP)', 'Calcium Se'], ['mmol/L', 'mmol/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Chloride = Chlorid\n", - "lab_data = append('cl', data, lab_data, ['Chlorid',
'Chlorid (HP)', 'Chlorid Se', 'Cl-'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Total Cholesterol = Gesamt Cholesterin\n", - "lab_data = append('tc', data, lab_data, ['Cholesterin', 'ges.Cholesterin', 'ges.Cholesterin HP', 'ges.Cholesterin Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Creatinine = Kreatinin\n", - "lab_data = append('cr', data, lab_data, ['Creatinin', 'Creatinin (enz)', 'Creatinin (enzymat.)', 'Kreatinin', 'Kreatinin (Jaffé)', 'Kreatinin (Jaffé) (HP)', 'Kreatinin (Jaffé) HP', 'Kreatinin (Jaffé) Se', 'Kreatinin (enzym.)', 'Kreatinin (enzym.) HP', 'Kreatinin (enzym.) Se'], ['mg/dl'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Cystatin C = Cystatin C\n", - "lab_data = append('cys_c', data, lab_data, ['Cystatin C', 'Cystatin C HP', 'Cystatin C Se'], ['mg/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# D Dimer = D Dimere\n", - "lab_data = append('d_dim', data, lab_data, ['D-Dimer', 'D-Dimere'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Iron = Eisen\n", - "lab_data = append('fe', data, lab_data, ['Eisen', 'Eisen (HP)', 'Eisen Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Eosinophils = Eosinophile\n", - "lab_data = append('eos', data, lab_data, ['Eosinophile absolut'])\n", - "\n", - "eos = data[(data['type'] == 'Eosinophile') & (data['unit'] == '/nl')]\n", - "eos['type'] = 'eos'\n", - "lab_data = pd.concat([lab_data, eos], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Eosinophils Relative = Eosinophile Relativ\n", - 
"lab_data = append('eos_rel', data, lab_data, ['Eosinophile %'])\n", - "\n", - "eos_rel = data[(data['type'] == 'Eosinophile') & (data['unit'] == '%')]\n", - "eos_rel['type'] = 'eos_rel'\n", - "lab_data = pd.concat([lab_data, eos_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Erythroblasts = Erythroblasten\n", - "lab_data = append('ebl', data, lab_data, ['Erythroblasten absolut'])\n", - "\n", - "ebl = data[(data['type'] == 'Erythroblasten') & (data['unit'] == '/nl')]\n", - "ebl['type'] = 'ebl'\n", - "lab_data = pd.concat([lab_data, ebl], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Erythroblasts Relative = Erythroblasten Relativ\n", - "lab_data = append('ebl_rel', data, lab_data, ['Erythroblasten %'])\n", - "\n", - "ebl_rel = data[(data['type'] == 'Erythroblasten') & (data['unit'] == '%')]\n", - "ebl_rel['type'] = 'ebl_rel'\n", - "lab_data = pd.concat([lab_data, ebl_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Erythrocytes = Erythrozyten\n", - "lab_data = append('rbc', data, lab_data, ['Erythrozyten'], ['/pl'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Fraction of Inspired Oxygen = Inspiratorische Sauerstofffraktion\n", - "lab_data = append('fio2', data, lab_data, ['FIO2'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Ferritin = Ferritin\n", - "lab_data = append('fer', data, lab_data, ['Ferritin', 'Ferritin HP', 'Ferritin SE', 'Ferritin Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Fibrinogen = Fibrinogen\n", - "lab_data = append('fg', data, lab_data, 
['Fibrinogen'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Schistocytes = Fragmentozyten\n", - "lab_data = append('schisto', data, lab_data, ['Fragmentozyten'], ['%'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Gamma Glutamyltransferase = Gamma Glutamyltransferase\n", - "lab_data = append('ggt', data, lab_data, ['GGT', 'GGT (HP)', 'gamma-GT', 'gamma-GT HP', 'gamma-GT Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Glutamate Dehydrogenase = Glutamat Dehydrogenase\n", - "lab_data = append('gdh', data, lab_data, ['GLDH', 'GLDH HP', 'GLDH Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Glucose = Glukose\n", - "lab_data = append('glu', data, lab_data, ['GLU', 'Glu', 'Glucose', 'Glucose HP', 'Glucose Se'], ['mg/dl', 'mg/dL'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ASAT (GOT) = ASAT (GOT)\n", - "lab_data = append('asat', data, lab_data, ['GOT (AST)', 'GOT (AST) (HP)', 'GOT (AST) HP', 'GOT (AST) Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# ALAT (GPT) = ALAT (GPT)\n", - "lab_data = append('alat', data, lab_data, ['GPT (ALT)', 'GPT (ALT) (HP)', 'GPT (ALT) HP', 'GPT (ALT) Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Bicarbonate = Bikarbonat\n", - "lab_data = append('hco3', data, lab_data, ['HCO3-', 'SBC', 'Standard Bicarbonat', 'Standardbicarbonat', 'aktuelles Bicarbonat'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# High Density Lipoprotein = HDL Cholesterin\n", - 
"lab_data = append('hdl', data, lab_data, ['HDL-Cholesterin', 'HDL-Cholesterin HP', 'HDL-Cholesterin Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Deoxyhemoglobin = Desoxyhämoglobin\n", - "lab_data = append('hhb', data, lab_data, ['HHb'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Haptoglobin = Haptoglobin\n", - "lab_data = append('hp', data, lab_data, ['Haptoglobin', 'Haptoglobin HP', 'Haptoglobin Se'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Urea = Harnstoff\n", - "lab_data = append('urea', data, lab_data, ['Harnstoff', 'Harnstoff (HP)', 'Harnstoff HP', 'Harnstoff Se'], ['mg/dl'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Uric Acid = Harnsäure\n", - "lab_data = append('ua', data, lab_data, ['Harnsäure', 'Harnsäure (HP)', 'Harnsäure HP', 'Harnsäure Se'], ['mg/dl'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Hemoglobin = Hämoglobin\n", - "lab_data = append('hb', data, lab_data, ['Hb', 'Hämoglobin', 'tHb'], ['g/dl', 'g/dL'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Glycated Hemoglobin = Glykosyliertes Hämoglobin\n", - "lab_data = append('hba1c', data, lab_data, ['HbA1c', 'HbA1c (EDTA)'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Hematocrit = Hämatokrit\n", - "lab_data = append('hct', data, lab_data, ['Hct', 'Hämatokrit', 'Hämatokrit (l/l)'], ['%'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# I/T Ratio = I/T Quotient\n", - "lab_data = append('it_ratio', data, 
lab_data, ['I/T Quotient maschinell'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# International Normalized Ratio = International Normalized Ratio\n", - "lab_data = append('inr', data, lab_data, ['INR', 'TPZ-INR'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Immature Platelet Fraction = Unreife Thrombozytenfraktion\n", - "lab_data = append('ipf', data, lab_data, ['Immature Plättchenfraktion'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Immunoglobulin A = Immunoglobulin A\n", - "lab_data = append('iga', data, lab_data, ['Immunglobulin A', 'Immunglobulin A HP', 'Immunglobulin A Se'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Immunoglobulin E = Immunoglobulin E\n", - "lab_data = append('ige', data, lab_data, ['Immunglobulin E', 'Immunglobulin E HP', 'Immunglobulin E Se'], ['kU/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Immunoglobulin G = Immunoglobulin G\n", - "lab_data = append('igg', data, lab_data, ['Immunglobulin G', 'Immunglobulin G HP', 'Immunglobulin G Se'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Immunoglobulin M = Immunoglobulin M\n", - "lab_data = append('igm', data, lab_data, ['Immunglobulin M', 'Immunglobulin M HP', 'Immunglobulin M Se'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Potassium = Kalium\n", - "lab_data = append('k', data, lab_data, ['K+', 'Kalium', 'Kalium HP', 'Kalium Se'], ['mmol/L', 'mmol/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - 
"source": [ - "# Lactate Dehydrogenase = Laktatdehydrogenase\n", - "lab_data = append('ldh', data, lab_data, ['LDH', 'LDH (HP)', 'LDH HP', 'LDH Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Low Density Lipoprotein = LDL Cholesterin\n", - "lab_data = append('ldl', data, lab_data, ['LDL-Cholesterin', 'LDL-Cholesterin HP', 'LDL-Cholesterin Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lactate = Laktat\n", - "lab_data = append('lac', data, lab_data, ['Lac', 'Lactat', 'Laktat'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Leukocytes = Leukozyten\n", - "lab_data = append('wbc', data, lab_data, ['Leukozyten'], ['/nl'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lipase = Lipase\n", - "lab_data = append('lps', data, lab_data, ['Lipase', 'Lipase (HP)', 'Lipase HP', 'Lipase Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lymphocytes = Lymphocytes\n", - "lab_data = append('lym', data, lab_data, ['Lymphozyten abs.', 'Lymphozyten absolut'], ['/nl'])\n", - "\n", - "lym = data[(data['type'] == 'Lymphozyten') & (data['unit'] == '/nl')]\n", - "lym['type'] = 'lym'\n", - "lab_data = pd.concat([lab_data, lym], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Lymphocytes Relative = Lymphocytes Relativ\n", - "lab_data = append('lym_rel', data, lab_data, ['Lymphozyten %', 'Lymphozyten rel.'])\n", - "\n", - "lym_rel = data[(data['type'] == 'Lymphozyten') & (data['unit'] == '%')]\n", - "lym_rel['type'] = 'lym_rel'\n", - "lab_data = pd.concat([lab_data, lym_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - 
"execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mean Corpuscular Hemoglobin = Mittleres Korpuskulares Hämoglobin\n", - "lab_data = append('mch', data, lab_data, ['MCH'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mean Corpuscular Hemoglobin Concentration = Mittlere Korpusukuläre Hämoglobin Konzentration\n", - "lab_data = append('mchc', data, lab_data, ['MCHC'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mean Corpuscular Volume = Mittleres Korpuskuläres Volumen\n", - "lab_data = append('mcv', data, lab_data, ['MCV'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Mean Platelet Volume = Mittleres Thrombozytenvolumen\n", - "lab_data = append('mpv', data, lab_data, ['MPV'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Magnesium = Magnesium\n", - "lab_data = append('mg', data, lab_data, ['Magnesium', 'Magnesium (HP)', 'Magnesium Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Methemoglobin = Methämoglobin\n", - "lab_data = append('methb', data, lab_data, ['MetHb'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Monocytes = Monocytes\n", - "lab_data = append('mono', data, lab_data, ['Monozyten abs.', 'Monozyten absolut'])\n", - "\n", - "mono = data[(data['type'] == 'Monozyten') & (data['unit'] == '/nl')]\n", - "mono['type'] = 'mono'\n", - "lab_data = pd.concat([lab_data, mono], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Monocytes Relative = Monocytes Relativ\n", - "lab_data = append('mono_rel', data, lab_data, 
['Monozyten %', 'Monozyten rel.'])\n", - "\n", - "mono_rel = data[(data['type'] == 'Monozyten') & (data['unit'] == '%')]\n", - "mono_rel['type'] = 'mono_rel'\n", - "lab_data = pd.concat([lab_data, mono_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Myelocytes = Myelozyten\n", - "lab_data = append('myelo', data, lab_data, ['Myelozyten'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Myoglobin = Myoglobin\n", - "lab_data = append('mb', data, lab_data, ['Myoglobin', 'Myoglobin HP', 'Myoglobin Se'], ['µg/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# N Terminal Pro B Type Natriuretic Peptide = N Terminal Pro B Type Natriuretic Peptide\n", - "lab_data = append('nt_probnp', data, lab_data, ['NT pro BNP', 'NT-pro BNP', 'NT-pro BNP (HP)'], ['ng/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Sodium = Natrium\n", - "lab_data = append('na', data, lab_data, ['Na+', 'Natrium', 'Natrium HP', 'Natrium Se'], ['mmol/L', 'mmol/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Neutrophils = Neutrophile\n", - "lab_data = append('pmn', data, lab_data, ['Neutrophile absolut'])\n", - "\n", - "pmn = data[(data['type'] == 'Neutrophile') & (data['unit'] == '/nl')]\n", - "pmn['type'] = 'pmn'\n", - "lab_data = pd.concat([lab_data, pmn], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Neutrophils Relative = Neutrophile Relativ\n", - "lab_data = append('pmn_rel', data, lab_data, ['Neutrophile %'])\n", - "\n", - "pmn_rel = data[(data['type'] == 'Neutrophile') & (data['unit'] == '%')]\n", - "pmn_rel['type'] = 'pmn_rel'\n", 
- "lab_data = pd.concat([lab_data, pmn_rel], ignore_index=True)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Oxygen Saturation = Sauerstoffsättigung\n", - "lab_data = append('so2', data, lab_data, ['O2-Sättigung', 'sO2'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Oxyhemoglobin = Oxyhämoglobin\n", - "lab_data = append('o2hb', data, lab_data, ['O2Hb'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Phosphorus = Phosphor\n", - "lab_data = append('p', data, lab_data, ['Phosphor, anorg.'], ['mmol/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Procalcitonin = Procalcitonin\n", - "lab_data = append('pct', data, lab_data, ['Procalcitonin', 'Procalcitonin (HP)', 'Procalcitonin HP', 'Procalcitonin Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Protein = Protein\n", - "lab_data = append('pro', data, lab_data, ['Protein', 'Protein HP'], ['g/l'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Pseudocholinesterase = Pseudocholinesterase\n", - "lab_data = append('pche', data, lab_data, ['PCHE', 'PCHE (HP)', 'Pseudo-Cholinesterase', 'Pseudo-Cholinesterase HP', 'Pseudo-Cholinesterase Se'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Quick Value = Quick Wert\n", - "lab_data = append('quick', data, lab_data, ['Quick (TPZ)'])" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "# Red Cell Distribution Width = Erythrozytenverteilungsbreite\n", - "lab_data = append('rdw', data, lab_data, ['RDW', 'RDW-CV'])" - ] - }, 
# %%
# Analytes that need nothing beyond a single append() call, written as
# (short label, raw lab type names[, accepted units]). The optional third
# element is forwarded only when it is present.
# NOTE(review): append() is defined outside this section; presumably it copies
# the rows of `data` matching the raw names (and units) into `lab_data` under
# the short label -- confirm against its definition.
analyte_specs = [
    # Reticulocytes (German: Retikulozyten)
    ('rtic', ['Retikulozyten'], ['/nl']),
    # Temperature (German: Temperatur)
    ('t', ['T', 'Temperatur']),
    # Prothrombin time (German: Thromboplastinzeit)
    ('pt', ['TPZ-Wert']),
    # Thyroid stimulating hormone (German: Schilddrüsenstimulierendes Hormon)
    ('tsh', ['TSH', 'TSH bas.', 'TSH bas. Se', 'TSH bas. i.Se', 'TSH basal', 'TSH basal (HP)', 'TSH basal Se']),
    # Platelets (German: Thrombozyten)
    ('plt', ['Thrombozyten']),
    # Transferrin
    ('trans', ['Transferrin', 'Transferrin HP', 'Transferrin Se'], ['g/l']),
    # Transferrin saturation (German: Transferrinsättigung)
    ('ts', ['Transferrin-Sättigung', 'Transferrin-Sättigung HP', 'Transferrin-Sättigung Se', 'Transferrinsättigung']),
    # Total triglycerides (German: Gesamt Triglyceride)
    ('tg', ['Triglyceride', 'Triglyceride HP', 'Triglyceride Se']),
    # Activated partial thromboplastin time (German: Partielle Thromboplastinzeit)
    ('aptt', ['aPTT']),
    # Phosphate (German: Phosphat)
    ('po4', ['anorg. PO4 HP', 'anorg. PO4 Se']),
    # Carbon dioxide partial pressure (German: Kohlendioxidpartialdruck)
    ('pco2', ['pCO2']),
    # Oxygen partial pressure (German: Sauerstoffpartialdruck)
    ('po2', ['pO2']),
]

for label, *selectors in analyte_specs:
    lab_data = append(label, data, lab_data, *selectors)

# %%
# Immature granulocytes, absolute (German: Unreife Granulozyten)
lab_data = append('ig', data, lab_data, ['unreife Granulozyten absolut'])

# Rows filed under the bare raw name carry either '/nl' or '%' as unit; the
# '/nl' ones are added here under the same 'ig' label as above.
# .assign() returns a new frame, so the relabel never writes into a slice of
# `data` (the original item assignment raised SettingWithCopyWarning).
ig = data.loc[
    (data['type'] == 'unreife Granulozyten') & (data['unit'] == '/nl')
].assign(type='ig')
lab_data = pd.concat([lab_data, ig], ignore_index=True)

# %%
# Immature granulocytes, relative (German: Unreife Granulozyten Relativ)
lab_data = append('ig_rel', data, lab_data, ['unreife Granulozyten %'])

# Same selection as above, but for the '%' rows. The label is 'ig_rel' so that
# these rows share one type with the append() call in this cell; the previous
# label 'immature_granulocytes_relative' split the analyte into two groups in
# the per-type counts written at the end of the notebook.
# NOTE(review): assumes append() tags rows with its first argument, as the
# manual 'ig' relabel in the previous cell suggests -- confirm.
ig_rel = data.loc[
    (data['type'] == 'unreife Granulozyten') & (data['unit'] == '%')
].assign(type='ig_rel')
lab_data = pd.concat([lab_data, ig_rel], ignore_index=True)

# %% [markdown]
# # Clean

# %%
# One pipeline instead of a series of in-place mutations:
#   1. value     -> float; anything non-numeric is coerced to NaN
#   2. case/type -> int / str
#   3. timestamp -> datetime
#   4. the unit column is dropped before dropna() so that a missing unit alone
#      does not remove a row
#   5. rows with any remaining missing field are dropped (this is where the
#      values coerced to NaN in step 1 are discarded)
#   6. exact duplicate rows are removed, keeping the first occurrence
#   7. rows are ordered by timestamp
lab_data = (
    lab_data
    .assign(value=lambda df: pd.to_numeric(df['value'], errors='coerce').astype(float))
    .astype({'case': int, 'type': str})
    .assign(timestamp=lambda df: pd.to_datetime(df['timestamp']))
    .drop(columns=['unit'])
    .dropna()
    .drop_duplicates(keep='first')
    .sort_values(by=['timestamp'])
)

lab_data

# %%
# Keep only measurements whose case appears in the cleaned case list, and
# report how many rows / cases the filter removes.
included_cases = pd.read_csv('../4_cases/4_3_clean.csv')

print('Length of lab data before: ' + str(len(lab_data)))
print('Number of unique cases in lab data before: ' + str(lab_data['case'].nunique()))
lab_data = lab_data[lab_data['case'].isin(included_cases['case'])]
print('Length of lab data after: ' + str(len(lab_data)))
print('Number of unique cases in lab data after: ' + str(lab_data['case'].nunique()))

lab_data

# %% [markdown]
# # Save

# %%
lab_data.to_csv('8_3_clean.csv', index=False)
lab_data

# %%
# Number of rows per analyte label, written with the label as index column.
types = lab_data.groupby('type').size()
types.to_csv('8_4_types.csv', index=True)
types