diff --git a/api/data/constants.py b/api/data/constants.py index c455472..f093568 100644 --- a/api/data/constants.py +++ b/api/data/constants.py @@ -26,7 +26,7 @@ GNOMAD_PATH = os.path.join(DATA_PATH, "gnomad/") CLINVAR_PATH = os.path.join(DATA_PATH, "clinvar/") DEFAULT_SAVE_PATH = os.path.join(DATA_PATH, "merged_data/") -SAVE_LOVD_GNOMAD = "../data/merged_lovd_gnomad/lovd_gnomad.csv" +SAVE_LOVD_GNOMAD = "../data/merged_data/lovd_gnomad.csv" # variable data types LOVD_TABLES_DATA_TYPES = { diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 6efc1bb..ee03fbc 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -349,19 +349,25 @@ def find_popmax_in_gnomad(data): data.loc[i, 'Popmax population'] = population_mapping[max_id] -def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path:str=DEFAULT_SAVE_PATH,overwrite:bool=False): +def routing_merge(lovd_path:str=LOVD_PATH, + gnomad_path:str=GNOMAD_PATH, + save_path:str=SAVE_LOVD_GNOMAD, + overwrite:bool=False): """ Merges data from provided paths and saves to new location :param overwrite: does file requires overwriting :param lovd_path: path to LOVD dataframe :param gnomad_path: path to gnomAD dataframe :param save_path: path where to save merged data - :return: + :return: None """ if overwrite: return + if not os.path.exists(os.path.dirname(save_path)): + os.makedirs(os.path.dirname(save_path)) + lovd_file = os.path.join(lovd_path, "lovd_data.txt") gnomad_file = os.path.join(gnomad_path, "gnomad_data.csv") @@ -379,6 +385,7 @@ def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path: # Extract "Variants_On_Genome" and merge it with "Variants_On_Transcripts" variants_on_genome = lovd_data["Variants_On_Genome"].copy() + gnomad_data = gnomad_data.copy() lovd_data = pd.merge( lovd_data["Variants_On_Transcripts"], @@ -387,22 +394,10 @@ def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path: how='left' ) - # Copy gnomAD data and merge with LOVD data - gnomad_data = gnomad_data.copy() final_data = merge_gnomad_lovd(lovd_data, gnomad_data) - if not os.path.exists(os.path.dirname(save_path)): - os.makedirs(os.path.dirname(save_path)) try: - final_data.to_csv(SAVE_LOVD_GNOMAD) + final_data.to_csv(save_path) print(f"Merged data saved to {save_path}") except OSError as e: print(f"Error saving file: {e}") - - save_to = SAVE_LOVD_GNOMAD - - # check if directory exists, if not - create - save_to_dir = os.path.dirname(save_to) - if not os.path.exists(save_to_dir): - os.makedirs(save_to_dir) - diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index b92df1c..55d7af3 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -1702,36 +1702,6 @@ } }, "cell_type": "code", - "source": [ - "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", - "import pandas as pd\n", - "from api import (store_database_for_eys_gene,\n", - " parse_lovd,\n", - " set_lovd_dtypes,\n", - " LOVD_PATH,\n", - " GNOMAD_PATH)\n", - "\n", - "store_database_for_eys_gene('lovd', False)\n", - "store_database_for_eys_gene('gnomad', False)\n", - "\n", - "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", - "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", - "\n", - "set_lovd_dtypes(lovd_data)\n", - "set_gnomad_dtypes(gnomad_data)\n", - "\n", - "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", - "\n", - "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", - " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", - " on='id',\n", - " how='left')\n", - "\n", - "gnomad_data = gnomad_data.copy()\n", - "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", - "final_data" - ], - "id": "d86fa6b925aea085", "outputs": [ { "name": "stdout", @@ -2180,19 +2150,52 @@ "output_type": "execute_result" } ], - "execution_count": 1 + "source": [ + "from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n", + "import pandas as pd\n", + "from api import (store_database_for_eys_gene,\n", + " parse_lovd,\n", + " set_lovd_dtypes,\n", + " LOVD_PATH,\n", + " GNOMAD_PATH)\n", + "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", + "\n", + "lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n", + "gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n", + "\n", + "set_lovd_dtypes(lovd_data)\n", + "set_gnomad_dtypes(gnomad_data)\n", + "\n", + "variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n", + "\n", + "lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n", + " variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n", + " on='id',\n", + " how='left')\n", + "\n", + "gnomad_data = gnomad_data.copy()\n", + "final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n", + "final_data" + ], + "id": "d86fa6b925aea085", + "execution_count": null }, { "metadata": { "ExecuteTime": { - "end_time": "2024-09-19T16:22:52.431348Z", - "start_time": "2024-09-19T16:22:43.027091Z" + "end_time": "2024-09-19T17:05:52.108543Z", + "start_time": "2024-09-19T17:04:55.710489Z" } }, "cell_type": "code", "source": [ "from api.data.refactoring import routing_merge\n", + "from api import store_database_for_eys_gene\n", "\n", + "store_database_for_eys_gene('lovd', False)\n", + "store_database_for_eys_gene('gnomad', False)\n", "routing_merge()" ], "id": "29ecf5e58e3d53e4", @@ -2201,7 +2204,7 @@ "name": "stdout", "output_type": "stream", "text": [ - "Merged data saved to C:\\Users\\Vlad\\PycharmProjects\\kath\\data/merged_data/\n" + "Merged data saved to ../data/merged_data/lovd_gnomad.csv\n" ] } ], @@ -2211,9 +2214,9 @@ "metadata": {}, "cell_type": "code", "outputs": [], - "execution_count": null, "source": "", - "id": "b5eedffd56faee1d" + "id": "b5eedffd56faee1d", + "execution_count": null } ], "metadata": {