Skip to content

Commit c536b13

Browse files
authored
Merge pull request #12 from DKI/update_pipeline
DKI/update_pipeline
2 parents a20b767 + a9e8e12 commit c536b13

File tree

1 file changed

+83
-72
lines changed

1 file changed

+83
-72
lines changed

data_collection/pipeline.py

Lines changed: 83 additions & 72 deletions
Original file line number | Diff line number | Diff line change
@@ -47,75 +47,86 @@ def calculate_max_frequency(row):
4747
return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
4848

4949

# MAIN
# Flat pipeline script: download the LOVD, gnomAD and ClinVar data, merge
# ClinVar and gnomAD onto the LOVD "Variants_On_Transcripts" table, derive the
# PopMax columns and write the result to DATA_PATH + "/final.csv".

# Download all data
store_database_for_eys_gene('lovd', True)
store_database_for_eys_gene('gnomad', True)
store_database_for_eys_gene('clinvar', True)

# Read and convert data
lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

# The return value is ignored, so this presumably converts lovd_data in
# place -- NOTE(review): confirm against convert_lovd_to_datatype.
convert_lovd_to_datatype(lovd_data)

# renaming databases' columns: the suffix marks which source each column came
# from once the tables are merged
gnomad_data.columns += "(gnomad)"
clinvar_data.columns += "(clinvar)"

# Reading main working table
main_frame = lovd_data["Variants_On_Transcripts"][0].copy()
notes = lovd_data["Variants_On_Transcripts"][1][::]

# Merging Clinvar
# Select the three needed columns first, then copy, so only the small slice
# is duplicated instead of the whole ClinVar table.
clinvar = clinvar_data[["Name(clinvar)",
                        "Germline classification(clinvar)",
                        "Accession(clinvar)"]].copy()
clinvar["VariantOnTranscript/DNA"] = (
    clinvar["Name(clinvar)"].apply(from_clinvar_name_to_cdna_position)
)

main_frame = pd.merge(main_frame,
                      clinvar,
                      how="outer",
                      on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1)

# MERGING GnomAd
main_frame = (pd.merge(main_frame,
                       gnomad_data,
                       how="left",
                       left_on="VariantOnTranscript/DNA",
                       right_on="HGVS Consequence(gnomad)")
              .drop("HGVS Consequence(gnomad)", axis=1))

# Calculating frequencies
# Rows whose gnomAD columns are null had no match in the left join above.
lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()

if lovd_with_gnomad.empty:
    # apply(axis=1) on an empty frame returns the frame itself rather than the
    # two-column result, and assigning that to two columns raises ValueError.
    # Create the (empty) PopMax columns directly instead.
    lovd_with_gnomad['PopMax(gnomad)'] = None
    lovd_with_gnomad['PopMax population(gnomad)'] = None
else:
    max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
    lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values

# Leaving necessary columns
lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
                                            'transcriptid',
                                            'effectid',
                                            'position_c_start',
                                            'position_c_start_intron',
                                            'position_c_end',
                                            'position_c_end_intron',
                                            'VariantOnTranscript/DNA',
                                            'VariantOnTranscript/RNA',
                                            'VariantOnTranscript/Protein',
                                            'VariantOnTranscript/Exon',
                                            'Germline classification(clinvar)',
                                            'Accession(clinvar)',
                                            'Allele Frequency(gnomad)',
                                            'Homozygote Count(gnomad)',
                                            'PopMax(gnomad)',
                                            'PopMax population(gnomad)']]

# final join
# Keep only the first 13 columns and use them as the join key.
# NOTE(review): 13 looks like the LOVD columns plus the two ClinVar columns
# listed above -- confirm against the LOVD table layout.
main_frame = main_frame.iloc[:, :13]
main_frame = pd.merge(main_frame,
                      lovd_with_gnomad,
                      how="left",
                      on=list(main_frame.columns[:13]))

main_frame.to_csv(DATA_PATH + "/final.csv")
def main():
    """
    Main function implementing pipeline for data collection and merging of data from
    LOVD, GNOMAD and CLINVAR.

    Steps, in order:
      1. download the three databases via ``store_database_for_eys_gene``;
      2. read them from LOVD_PATH, GNOMAD_PATH and CLINVAR_PATH;
      3. outer-join ClinVar and left-join gnomAD onto the LOVD
         ``Variants_On_Transcripts`` table using the cDNA notation;
      4. compute the PopMax columns for rows that have gnomAD data;
      5. write the merged table to ``DATA_PATH + "/final.csv"``.

    Returns nothing; the result is the CSV file written in the last step.
    """
    dna_column = "VariantOnTranscript/DNA"
    popmax_columns = ['PopMax(gnomad)', 'PopMax population(gnomad)']
    # Number of leading columns kept as the final join key.
    # NOTE(review): 13 looks like the LOVD columns plus the two ClinVar
    # columns selected below -- confirm against the LOVD table layout.
    key_column_count = 13

    # Download all data
    for database_name in ('lovd', 'gnomad', 'clinvar'):
        store_database_for_eys_gene(database_name, True)

    # Read and convert data
    lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
    gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
    clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

    # The return value is ignored, so this presumably converts lovd_data in
    # place -- NOTE(review): confirm against convert_lovd_to_datatype.
    convert_lovd_to_datatype(lovd_data)

    # renaming databases' columns: the suffix marks which source each column
    # came from once the tables are merged
    gnomad_data.columns += "(gnomad)"
    clinvar_data.columns += "(clinvar)"

    # Reading main working table
    main_frame = lovd_data["Variants_On_Transcripts"][0].copy()

    # Merging Clinvar
    # Select the three needed columns first, then copy, so only the small
    # slice is duplicated instead of the whole ClinVar table.
    clinvar = clinvar_data[["Name(clinvar)",
                            "Germline classification(clinvar)",
                            "Accession(clinvar)"]].copy()
    clinvar[dna_column] = clinvar["Name(clinvar)"].apply(
        from_clinvar_name_to_cdna_position)

    main_frame = pd.merge(main_frame,
                          clinvar,
                          how="outer",
                          on=[dna_column]).drop("Name(clinvar)", axis=1)

    # MERGING GnomAd
    main_frame = (pd.merge(main_frame,
                           gnomad_data,
                           how="left",
                           left_on=dna_column,
                           right_on="HGVS Consequence(gnomad)").
                  drop("HGVS Consequence(gnomad)",
                       axis=1))

    # Calculating frequencies
    # Rows whose gnomAD columns are null had no match in the left join above.
    has_gnomad = main_frame["Hemizygote Count Remaining(gnomad)"].notna()
    lovd_with_gnomad = main_frame[has_gnomad].copy()

    if lovd_with_gnomad.empty:
        # apply(axis=1) on an empty frame returns the frame itself rather
        # than the two-column result, and assigning that to two columns
        # raises ValueError. Create the (empty) PopMax columns directly.
        for popmax_column in popmax_columns:
            lovd_with_gnomad[popmax_column] = None
    else:
        max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
        lovd_with_gnomad[popmax_columns] = max_values

    # Leaving necessary columns
    lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
                                                'transcriptid',
                                                'effectid',
                                                'position_c_start',
                                                'position_c_start_intron',
                                                'position_c_end',
                                                'position_c_end_intron',
                                                'VariantOnTranscript/DNA',
                                                'VariantOnTranscript/RNA',
                                                'VariantOnTranscript/Protein',
                                                'VariantOnTranscript/Exon',
                                                'Germline classification(clinvar)',
                                                'Accession(clinvar)',
                                                'Allele Frequency(gnomad)',
                                                'Homozygote Count(gnomad)',
                                                'PopMax(gnomad)',
                                                'PopMax population(gnomad)']]

    # final join
    # Drop everything but the key columns, then bring the reduced gnomAD
    # columns (including PopMax) back in; rows without gnomAD data get NaN.
    main_frame = main_frame.iloc[:, :key_column_count]
    main_frame = pd.merge(main_frame,
                          lovd_with_gnomad,
                          how="left",
                          on=list(main_frame.columns[:key_column_count]))

    main_frame.to_csv(DATA_PATH + "/final.csv")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)