Skip to content

Commit 03b8943

Browse files
author
Dainius Kirsnauskas
committed
add __name__
1 parent a20b767 commit 03b8943

File tree

1 file changed

+77
-72
lines changed

1 file changed

+77
-72
lines changed

data_collection/pipeline.py

Lines changed: 77 additions & 72 deletions
Original file line numberDiff line numberDiff line change
@@ -47,75 +47,80 @@ def calculate_max_frequency(row):
4747
return pd.Series([max_freq, max_pop], index=['PopMax', 'PopMax population'])
4848

4949

50-
# MAIN
51-
# Download all data
52-
store_database_for_eys_gene('lovd', True)
53-
store_database_for_eys_gene('gnomad', True)
54-
store_database_for_eys_gene('clinvar', True)
55-
56-
# Read and convert data
57-
lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
58-
gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
59-
clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')
60-
61-
convert_lovd_to_datatype(lovd_data)
62-
63-
# renaming databases' columns
64-
gnomad_data.columns += "(gnomad)"
65-
clinvar_data.columns += "(clinvar)"
66-
67-
# Reading main working table
68-
main_frame = lovd_data["Variants_On_Transcripts"][0].copy()
69-
notes = lovd_data["Variants_On_Transcripts"][1][::]
70-
71-
# Merging Clinvar
72-
clinvar = clinvar_data.copy()[["Name(clinvar)",
73-
"Germline classification(clinvar)",
74-
"Accession(clinvar)"]]
75-
clinvar["VariantOnTranscript/DNA"] = (clinvar["Name(clinvar)"].
76-
apply(from_clinvar_name_to_cdna_position))
77-
78-
main_frame = pd.merge(main_frame,
79-
clinvar,
80-
how="outer",
81-
on=["VariantOnTranscript/DNA"]).drop("Name(clinvar)", axis=1)
82-
83-
# MERGING GnomAd
84-
main_frame = (pd.merge(main_frame,
85-
gnomad_data,
86-
how="left",
87-
left_on="VariantOnTranscript/DNA",
88-
right_on="HGVS Consequence(gnomad)").drop("HGVS Consequence(gnomad)",
89-
axis=1))
90-
91-
# Calculating frequencies
92-
lovd_without_association_in_gnomad = pd.isnull(main_frame["Hemizygote Count Remaining(gnomad)"])
93-
lovd_with_gnomad = main_frame[~lovd_without_association_in_gnomad].copy()
94-
max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
95-
lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values
96-
97-
# Leaving necessary columns
98-
99-
lovd_with_gnomad = lovd_with_gnomad.loc[:, ['id',
100-
'transcriptid',
101-
'effectid',
102-
'position_c_start',
103-
'position_c_start_intron',
104-
'position_c_end',
105-
'position_c_end_intron',
106-
'VariantOnTranscript/DNA',
107-
'VariantOnTranscript/RNA',
108-
'VariantOnTranscript/Protein',
109-
'VariantOnTranscript/Exon',
110-
'Germline classification(clinvar)',
111-
'Accession(clinvar)',
112-
'Allele Frequency(gnomad)',
113-
'Homozygote Count(gnomad)',
114-
'PopMax(gnomad)',
115-
'PopMax population(gnomad)']]
116-
117-
# final join
118-
main_frame = main_frame.iloc[:, range(13)]
119-
main_frame = pd.merge(main_frame, lovd_with_gnomad, how="left", on=list(main_frame.columns[:13]))
120-
121-
main_frame.to_csv(DATA_PATH + "/final.csv")
50+
def main():
    """Download the source databases, merge them and write ``final.csv``.

    Steps, in order:

    1. Call ``store_database_for_eys_gene`` for ``'lovd'``, ``'gnomad'`` and
       ``'clinvar'``.
    2. Load the three downloaded files and append ``(gnomad)`` /
       ``(clinvar)`` to the gnomAD and ClinVar column names so their origin
       stays visible after merging.
    3. Outer-join ClinVar onto the LOVD ``Variants_On_Transcripts`` table on
       ``VariantOnTranscript/DNA``, then left-join gnomAD on the same column
       against ``HGVS Consequence(gnomad)``.
    4. For rows that have gnomAD data, compute the ``PopMax`` columns with
       ``calculate_max_frequency``.
    5. Left-join those enriched rows back onto the first 13 columns of the
       merged table and write the result to ``DATA_PATH + "/final.csv"``.

    Returns:
        None. The only output is the CSV file written in step 5.
    """
    # Download all data. The second argument is passed through as-is; its
    # meaning is defined by store_database_for_eys_gene (not visible here).
    for database in ('lovd', 'gnomad', 'clinvar'):
        store_database_for_eys_gene(database, True)

    # Read and convert data
    lovd_data = parse_lovd(LOVD_PATH + "/lovd_data.txt")
    gnomad_data = pd.read_csv(GNOMAD_PATH + "/gnomad_data.csv")
    clinvar_data = pd.read_csv(CLINVAR_PATH + "/clinvar_data.txt", sep='\t')

    convert_lovd_to_datatype(lovd_data)

    # Tag every gnomAD / ClinVar column with its source database
    gnomad_data.columns += "(gnomad)"
    clinvar_data.columns += "(clinvar)"

    # Main working table: first element of the "Variants_On_Transcripts" entry
    main_frame = lovd_data["Variants_On_Transcripts"][0].copy()

    # Merging ClinVar: select the needed columns first, then copy, so only
    # those three columns are duplicated before the join key is added.
    clinvar = clinvar_data[["Name(clinvar)",
                            "Germline classification(clinvar)",
                            "Accession(clinvar)"]].copy()
    clinvar["VariantOnTranscript/DNA"] = (
        clinvar["Name(clinvar)"].apply(from_clinvar_name_to_cdna_position)
    )

    main_frame = pd.merge(
        main_frame,
        clinvar,
        how="outer",
        on=["VariantOnTranscript/DNA"],
    ).drop("Name(clinvar)", axis=1)

    # Merging gnomAD
    main_frame = pd.merge(
        main_frame,
        gnomad_data,
        how="left",
        left_on="VariantOnTranscript/DNA",
        right_on="HGVS Consequence(gnomad)",
    ).drop("HGVS Consequence(gnomad)", axis=1)

    # Calculating frequencies, only for rows that received gnomAD data
    # (a non-null "Hemizygote Count Remaining(gnomad)" is used as the marker).
    has_gnomad = main_frame["Hemizygote Count Remaining(gnomad)"].notna()
    lovd_with_gnomad = main_frame[has_gnomad].copy()
    max_values = lovd_with_gnomad.apply(calculate_max_frequency, axis=1)
    lovd_with_gnomad[['PopMax(gnomad)', 'PopMax population(gnomad)']] = max_values

    # Leaving necessary columns
    lovd_with_gnomad = lovd_with_gnomad.loc[:, [
        'id',
        'transcriptid',
        'effectid',
        'position_c_start',
        'position_c_start_intron',
        'position_c_end',
        'position_c_end_intron',
        'VariantOnTranscript/DNA',
        'VariantOnTranscript/RNA',
        'VariantOnTranscript/Protein',
        'VariantOnTranscript/Exon',
        'Germline classification(clinvar)',
        'Accession(clinvar)',
        'Allele Frequency(gnomad)',
        'Homozygote Count(gnomad)',
        'PopMax(gnomad)',
        'PopMax population(gnomad)',
    ]]

    # Final join: keep the leading columns of the merged table and join the
    # gnomAD-enriched rows back on all of them.
    # NOTE(review): 13 presumably equals the LOVD + ClinVar columns listed
    # first in the selection above -- confirm against the LOVD table schema.
    key_column_count = 13
    main_frame = main_frame.iloc[:, :key_column_count]
    main_frame = pd.merge(
        main_frame,
        lovd_with_gnomad,
        how="left",
        on=list(main_frame.columns),
    )

    main_frame.to_csv(DATA_PATH + "/final.csv")


if __name__ == "__main__":
    main()

0 commit comments

Comments
 (0)