diff --git a/data_collection/__init__.py b/data_collection/__init__.py index 0921646..a963013 100644 --- a/data_collection/__init__.py +++ b/data_collection/__init__.py @@ -54,5 +54,8 @@ # Functions for refactoring data convert_lovd_to_datatype, parse_lovd, - from_clinvar_name_to_cdna_position + from_clinvar_name_to_cdna_position, + + # Functions for data merging + merge_lovd_with_gnomad, ) diff --git a/data_collection/pipeline.py b/data_collection/pipeline.py index 4c5b402..a1670a0 100644 --- a/data_collection/pipeline.py +++ b/data_collection/pipeline.py @@ -4,7 +4,8 @@ import pandas as pd from .collection import store_database_for_eys_gene -from .refactoring import parse_lovd, convert_lovd_to_datatype, from_clinvar_name_to_cdna_position +from .refactoring import (parse_lovd, convert_lovd_to_datatype, + from_clinvar_name_to_cdna_position, merge_lovd_with_gnomad) from .constants import (DATA_PATH, LOVD_PATH, GNOMAD_PATH, @@ -89,13 +90,7 @@ def main(): on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1) # MERGING GnomAd - main_frame = (pd.merge(main_frame, - gnomad_data, - how="left", - left_on="VariantOnTranscript/DNA", - right_on="HGVS Consequence(gnomad)"). - drop("HGVS Consequence(gnomad)", - axis=1)) + main_frame = merge_lovd_with_gnomad(main_frame, gnomad_data) # Calculating frequencies lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"]) diff --git a/data_collection/refactoring.py b/data_collection/refactoring.py index 9329a62..ce26217 100644 --- a/data_collection/refactoring.py +++ b/data_collection/refactoring.py @@ -124,3 +124,19 @@ def from_clinvar_name_to_cdna_position(name): break return name[start:end] + +def merge_lovd_with_gnomad(lovd_data, gnomad_data): + """ + Outer-merges LOVD and GNOMAD data on their DNA variant columns. 
+ + :param DataFrame lovd_data: LOVD data, matched on ``VariantOnTranscript/DNA`` + :param DataFrame gnomad_data: GNOMAD data, matched on ``Transcript Consequence(gnomad)`` + :returns: outer join of both frames; unmatched rows and both key columns are kept + :rtype: DataFrame + """ + + return pd.merge(lovd_data, + gnomad_data, + how="outer", + left_on="VariantOnTranscript/DNA", + right_on="Transcript Consequence(gnomad)")