Skip to content

Commit

Permalink
routing merge fix
Browse files Browse the repository at this point in the history
  • Loading branch information
Akaud committed Sep 19, 2024
1 parent fbe20ef commit b93afbb
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 52 deletions.
2 changes: 1 addition & 1 deletion api/data/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@
GNOMAD_PATH = os.path.join(DATA_PATH, "gnomad/")
CLINVAR_PATH = os.path.join(DATA_PATH, "clinvar/")
DEFAULT_SAVE_PATH = os.path.join(DATA_PATH, "merged_data/")
SAVE_LOVD_GNOMAD = "../data/merged_lovd_gnomad/lovd_gnomad.csv"
SAVE_LOVD_GNOMAD = "../data/merged_data/lovd_gnomad.csv"

# variable data types
LOVD_TABLES_DATA_TYPES = {
Expand Down
25 changes: 10 additions & 15 deletions api/data/refactoring.py
Original file line number Diff line number Diff line change
Expand Up @@ -349,19 +349,25 @@ def find_popmax_in_gnomad(data):
data.loc[i, 'Popmax population'] = population_mapping[max_id]


def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path:str=DEFAULT_SAVE_PATH,overwrite:bool=False):
def routing_merge(lovd_path:str=LOVD_PATH,
gnomad_path:str=GNOMAD_PATH,
save_path:str=SAVE_LOVD_GNOMAD,
overwrite:bool=False):
"""
Merges data from provided paths and saves to new location
:param overwrite: whether the file requires overwriting
:param lovd_path: path to LOVD dataframe
:param gnomad_path: path to gnomAD dataframe
:param save_path: path where to save merged data
:return:
:return: None
"""

if overwrite:
return

if not os.path.exists(os.path.dirname(save_path)):
os.makedirs(os.path.dirname(save_path))

lovd_file = os.path.join(lovd_path, "lovd_data.txt")
gnomad_file = os.path.join(gnomad_path, "gnomad_data.csv")

Expand All @@ -379,6 +385,7 @@ def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path:

# Extract "Variants_On_Genome" and merge it with "Variants_On_Transcripts"
variants_on_genome = lovd_data["Variants_On_Genome"].copy()
gnomad_data = gnomad_data.copy()

lovd_data = pd.merge(
lovd_data["Variants_On_Transcripts"],
Expand All @@ -387,22 +394,10 @@ def routing_merge(lovd_path:str=LOVD_PATH,gnomad_path:str=GNOMAD_PATH,save_path:
how='left'
)

# Copy gnomAD data and merge with LOVD data
gnomad_data = gnomad_data.copy()
final_data = merge_gnomad_lovd(lovd_data, gnomad_data)

if not os.path.exists(os.path.dirname(save_path)):
os.makedirs(os.path.dirname(save_path))
try:
final_data.to_csv(SAVE_LOVD_GNOMAD)
final_data.to_csv(save_path)
print(f"Merged data saved to {save_path}")
except OSError as e:
print(f"Error saving file: {e}")

save_to = SAVE_LOVD_GNOMAD

# check if directory exists, if not - create
save_to_dir = os.path.dirname(save_to)
if not os.path.exists(save_to_dir):
os.makedirs(save_to_dir)

75 changes: 39 additions & 36 deletions tests/pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -1702,36 +1702,6 @@
}
},
"cell_type": "code",
"source": [
"from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
"import pandas as pd\n",
"from api import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
" set_lovd_dtypes,\n",
" LOVD_PATH,\n",
" GNOMAD_PATH)\n",
"\n",
"store_database_for_eys_gene('lovd', False)\n",
"store_database_for_eys_gene('gnomad', False)\n",
"\n",
"lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
"gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
"\n",
"set_lovd_dtypes(lovd_data)\n",
"set_gnomad_dtypes(gnomad_data)\n",
"\n",
"variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
"\n",
"lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
" variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n",
" on='id',\n",
" how='left')\n",
"\n",
"gnomad_data = gnomad_data.copy()\n",
"final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
"final_data"
],
"id": "d86fa6b925aea085",
"outputs": [
{
"name": "stdout",
Expand Down Expand Up @@ -2180,19 +2150,52 @@
"output_type": "execute_result"
}
],
"execution_count": 1
"source": [
"from api.data.refactoring import merge_gnomad_lovd, parse_gnomad, set_gnomad_dtypes\n",
"import pandas as pd\n",
"from api import (store_database_for_eys_gene,\n",
" parse_lovd,\n",
" set_lovd_dtypes,\n",
" LOVD_PATH,\n",
" GNOMAD_PATH)\n",
"\n",
"store_database_for_eys_gene('lovd', False)\n",
"store_database_for_eys_gene('gnomad', False)\n",
"\n",
"lovd_data = parse_lovd(LOVD_PATH + \"/lovd_data.txt\")\n",
"gnomad_data = parse_gnomad(GNOMAD_PATH+'/gnomad_data.csv')\n",
"\n",
"set_lovd_dtypes(lovd_data)\n",
"set_gnomad_dtypes(gnomad_data)\n",
"\n",
"variants_on_genome = lovd_data[\"Variants_On_Genome\"].copy()\n",
"\n",
"lovd_data = pd.merge(lovd_data[\"Variants_On_Transcripts\"],\n",
" variants_on_genome[['id','VariantOnGenome/DNA','VariantOnGenome/DNA/hg38']],\n",
" on='id',\n",
" how='left')\n",
"\n",
"gnomad_data = gnomad_data.copy()\n",
"final_data = merge_gnomad_lovd(lovd_data, gnomad_data)\n",
"final_data"
],
"id": "d86fa6b925aea085",
"execution_count": null
},
{
"metadata": {
"ExecuteTime": {
"end_time": "2024-09-19T16:22:52.431348Z",
"start_time": "2024-09-19T16:22:43.027091Z"
"end_time": "2024-09-19T17:05:52.108543Z",
"start_time": "2024-09-19T17:04:55.710489Z"
}
},
"cell_type": "code",
"source": [
"from api.data.refactoring import routing_merge\n",
"from api import store_database_for_eys_gene\n",
"\n",
"store_database_for_eys_gene('lovd', False)\n",
"store_database_for_eys_gene('gnomad', False)\n",
"routing_merge()"
],
"id": "29ecf5e58e3d53e4",
Expand All @@ -2201,7 +2204,7 @@
"name": "stdout",
"output_type": "stream",
"text": [
"Merged data saved to C:\\Users\\Vlad\\PycharmProjects\\kath\\data/merged_data/\n"
"Merged data saved to ../data/merged_data/lovd_gnomad.csv\n"
]
}
],
Expand All @@ -2211,9 +2214,9 @@
"metadata": {},
"cell_type": "code",
"outputs": [],
"execution_count": null,
"source": "",
"id": "b5eedffd56faee1d"
"id": "b5eedffd56faee1d",
"execution_count": null
}
],
"metadata": {
Expand Down

0 comments on commit b93afbb

Please sign in to comment.