From 1b6ac7d5bc2069e97cdaa70470729ebc77e1e27c Mon Sep 17 00:00:00 2001 From: Vladyslav Levchenko Date: Wed, 25 Sep 2024 22:44:57 +0300 Subject: [PATCH] move content of routing merge to app/back-end/src/routes/workspace_merge_route --- api/data/refactoring.py | 53 ------------------- .../src/routes/workspace_merge_route.py | 41 ++++++++++++++ tests/pipeline.ipynb | 42 --------------- 3 files changed, 41 insertions(+), 95 deletions(-) diff --git a/api/data/refactoring.py b/api/data/refactoring.py index 8710912..44d600c 100644 --- a/api/data/refactoring.py +++ b/api/data/refactoring.py @@ -346,56 +346,3 @@ def find_popmax_in_gnomad(data): max_id = population_id data.loc[i, 'Popmax'] = max_pop data.loc[i, 'Popmax population'] = population_mapping[max_id] - - -def routing_merge(lovd_path:str=LOVD_PATH, - gnomad_path:str=GNOMAD_PATH, - save_path:str=DEFAULT_SAVE_PATH, - overwrite:bool=False): - """ - Merges data from provided paths and saves to new location - :param overwrite: does file requires overwriting - :param lovd_path: path to LOVD dataframe - :param gnomad_path: path to gnomAD dataframe - :param save_path: path where to save merged data - :return: None - """ - - save_as = os.path.join(save_path, "lovd_gnomad.csv") - - if os.path.exists(save_as) and not overwrite: - return - - if not os.path.exists(save_path): - os.makedirs(save_path) - - if not os.path.exists(os.path.join(lovd_path, "lovd_data.txt")): - raise FileNotFoundError(f"LOVD data file not found at: {lovd_path}") - - if not os.path.exists(os.path.join(gnomad_path, "gnomad_data.csv")): - raise FileNotFoundError(f"gnomAD data file not found at: {gnomad_path}") - - lovd_data = parse_lovd(lovd_path + "/lovd_data.txt") - gnomad_data = parse_gnomad(gnomad_path + '/gnomad_data.csv') - - set_lovd_dtypes(lovd_data) - set_gnomad_dtypes(gnomad_data) - - # Extract "Variants_On_Genome" and merge it with "Variants_On_Transcripts" - variants_on_genome = lovd_data["Variants_On_Genome"].copy() - gnomad_data = gnomad_data.copy() - - lovd_data = pd.merge( - lovd_data["Variants_On_Transcripts"], - variants_on_genome[['id', 'VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']], - on='id', - how='left' - ) - - final_data = merge_gnomad_lovd(lovd_data, gnomad_data) - - try: - final_data.to_csv(save_as) - print(f"Merged data saved to {save_path}") - except OSError as e: - print(f"Error saving file: {e}") diff --git a/app/back-end/src/routes/workspace_merge_route.py b/app/back-end/src/routes/workspace_merge_route.py index 6de3f6c..bc56954 100644 --- a/app/back-end/src/routes/workspace_merge_route.py +++ b/app/back-end/src/routes/workspace_merge_route.py @@ -8,6 +8,8 @@ import os import time # TODO: Remove this import once the merge logic is implemented + +import pandas as pd from flask import Blueprint, request, jsonify from src.setup.extensions import logger @@ -20,6 +22,9 @@ WORKSPACE_UPDATE_FEEDBACK_EVENT, ) +from api import set_lovd_dtypes, parse_lovd +from api.data import merge_gnomad_lovd, set_gnomad_dtypes, parse_gnomad + workspace_merge_route_bp = Blueprint("workspace_merge_route", __name__) @@ -85,6 +90,42 @@ def get_workspace_merge_lovd_gnomad(relative_path): # [destination_path, override, lovd_file, gnomad_file] # + if os.path.exists(destination_path) and not override: + return + + if not os.path.exists(destination_path): + os.makedirs(destination_path) + + if not os.path.exists(lovd_file): + raise FileNotFoundError(f"LOVD data file not found at: {lovd_file}") + + if not os.path.exists(gnomad_file): + raise FileNotFoundError(f"gnomAD data file not found at: {gnomad_file}") + + lovd_data = parse_lovd(lovd_file) + gnomad_data = parse_gnomad(gnomad_file) + + set_lovd_dtypes(lovd_data) + set_gnomad_dtypes(gnomad_data) + + # Extract "Variants_On_Genome" and merge it with "Variants_On_Transcripts" + variants_on_genome = lovd_data["Variants_On_Genome"].copy() + gnomad_data = gnomad_data.copy() + + lovd_data = pd.merge( + lovd_data["Variants_On_Transcripts"], + variants_on_genome[['id', 'VariantOnGenome/DNA', 'VariantOnGenome/DNA/hg38']], + on='id', + how='left' + ) + + final_data = merge_gnomad_lovd(lovd_data, gnomad_data) + + try: + final_data.to_csv(destination_path) + except OSError as e: + raise RuntimeError(f"Error saving file: {e}") + # TODO: Remove this sleep statement once the merge logic is implemented time.sleep(1) # Simulate a delay for the merge process diff --git a/tests/pipeline.ipynb b/tests/pipeline.ipynb index 869bb57..6b3f75b 100644 --- a/tests/pipeline.ipynb +++ b/tests/pipeline.ipynb @@ -2181,48 +2181,6 @@ ], "id": "d86fa6b925aea085", "execution_count": null - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-22T13:05:20.884141Z", - "start_time": "2024-09-22T13:05:19.105877Z" - } - }, - "cell_type": "code", - "source": [ - "from api.data.refactoring import routing_merge\n", - "routing_merge()" - ], - "id": "29ecf5e58e3d53e4", - "outputs": [], - "execution_count": 1 - }, - { - "metadata": { - "ExecuteTime": { - "end_time": "2024-09-22T13:03:46.634257Z", - "start_time": "2024-09-22T13:02:58.726380Z" - } - }, - "cell_type": "code", - "source": [ - "from api import store_database_for_eys_gene\n", - "\n", - "store_database_for_eys_gene('lovd', False)\n", - "store_database_for_eys_gene('gnomad', False)" - ], - "id": "b5eedffd56faee1d", - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The file at ../data/lovd/lovd_data.txt already exists.\n" - ] - } - ], - "execution_count": 1 } ], "metadata": {