-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathprev_calc.py
148 lines (101 loc) · 6.67 KB
/
prev_calc.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
cluster1= ["NORMAL","NORMAL-NORMAL","OBESE-NORMAL","NORMAL","STOOL","STOOL","NORMAL","NORMAL","OBESE","OBESE"]
cluster2= ["OBESE","NORMAL-OBESE","OBESE-OBESE","OBESE","MECONIUM","MECONIUM","NORMAL-NORMAL","NORMAL-OBESE","OBESE-NORMAL","OBESE-OBESE"]
metadata_path= 'D:\Sample\MİKRO ANALİZ GRUPLANDIRMA.xlsx'
taxa_path= 'D:\Sample'
output_path= "D:\Sample\deneme\\"
def prevalence_calculator(metadata_path,taxa_path,output_path,cluster1, cluster2):
import pandas as pd
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows
import os
##Part1
"1.1: read metadata"
metadata=pd.read_excel(metadata_path,sheet_name='Sayfa1')
"1.2: discard unnecessay columns"
metadata=metadata.iloc[:,4::]
"""1.3: Separate data by groups w/ sample-id"""
groupnames=[]
for i in range(len(metadata.columns)-1):
groupnames.append('g'+str(i+1))
globals()[groupnames[i]]=metadata.iloc[:,[0,i+1]]
"""1.4: drop N/A values"""
for i in range(len(groupnames)):
globals()[groupnames[i]]=globals()[groupnames[i]].dropna()
"""1.5: cluster in groups as gn1&&gn2"""
counter0=0
groupnew=[]
exitname=[]
for grpnm in groupnames:
counter1=-1
counter2=-1
exitname.append(str(globals()[grpnm].columns[1]))
for i in range(len(globals()[grpnm].iloc[:,1])):
if globals()[grpnm].iloc[i,1]== cluster1[counter0]:
counter1 +=1
counter2 +=1
globals()[grpnm+"1"]=globals()[grpnm].iloc[0:counter1+1,::]
if globals()[grpnm].iloc[i,1]== cluster2[counter0]:
counter2 +=1
globals()[grpnm+"2"]=globals()[grpnm].iloc[counter1+1:counter2+1,::]
groupnew.append(grpnm+"1")
groupnew.append(grpnm+"2")
counter0 +=1
##Part2
"""2.1: reach working directory"""
os.chdir(taxa_path)
"""2.2: import multiple DFs"""
files=os.listdir(taxa_path)
files_xls = [f for f in files if f[-13:-5] == 'taxonomy']
for i in range(len(files_xls)):
globals()[files_xls[i]]=pd.read_excel(str(os.getcwd())+ "\\"+ files_xls[i])
##Part3
for grpnm in groupnew:
globals()[grpnm+'_df']=pd.DataFrame()
for i in range(len(globals()[grpnm])):
globals()[grpnm+'_df']=pd.concat([(globals()[grpnm+'_df']),
(globals()[globals()[grpnm].iloc[i,0]+'taxonomy.xlsx'])])
"""select ~% columns"""
for grpnm in groupnew:
globals()[grpnm+'_df']=globals()[grpnm+'_df'].iloc[:,1::2]
##Part4
"""4.1: get longest column"""
counter3=0
wb= Workbook()
for grpnm in groupnew:
globals()[grpnm+'_finaldf']=pd.DataFrame()
globals()[grpnm+'_free']=pd.DataFrame()
LP=len((globals()[grpnm+'_df']).pivot_table(index=['Phylum'], aggfunc='size').index)
LC=len((globals()[grpnm+'_df']).pivot_table(index=['Class'], aggfunc='size').index)
LO=len((globals()[grpnm+'_df']).pivot_table(index=['Order'], aggfunc='size').index)
LF=len((globals()[grpnm+'_df']).pivot_table(index=['Family'], aggfunc='size').index)
LG=len((globals()[grpnm+'_df']).pivot_table(index=['Genus'], aggfunc='size').index)
LS=len((globals()[grpnm+'_df']).pivot_table(index=['Species'], aggfunc='size').index)
max_len=max(LP,LC,LO,LF,LG,LS)
"""4.2: create DF w/ longest column"""
#get taxa && prev values then match w/ max_len
globals()[grpnm+'_finaldf']["Phylum"]=list((((globals()[grpnm+'_df'])).pivot_table(index=['Phylum'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LP))
globals()[grpnm+'_finaldf']["PrevPhylum"]=list((((globals()[grpnm+'_df'])).pivot_table(index=['Phylum'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LP))
globals()[grpnm+'_finaldf']["Class"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Class'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LC))
globals()[grpnm+'_finaldf']["PrevClass"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Class'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LC))
globals()[grpnm+'_finaldf']["Order"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Order'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LO))
globals()[grpnm+'_finaldf']["PrevOrder"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Order'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LO))
globals()[grpnm+'_finaldf']["Family"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Family'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LF))
globals()[grpnm+'_finaldf']["PrevFamily"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Family'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LF))
globals()[grpnm+'_finaldf']["Genus"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Genus'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LG))
globals()[grpnm+'_finaldf']["PrevGenus"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Genus'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LG))
globals()[grpnm+'_finaldf']["Species"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Species'], aggfunc='size').sort_values(ascending=False)).index) + (['']*(max_len-LS))
globals()[grpnm+'_finaldf']["PrevSpecies"]=list(((globals()[grpnm+'_df']).pivot_table(index=['Species'], aggfunc='size').sort_values(ascending=False)).values / len(globals()[grpnm])) + (['']*(max_len-LS))
"""4.3: to Excel"""
if int(grpnm[-1:])%2 != 0 :
wb= Workbook()
ws0=wb.active
ws0.title= cluster1[counter3]
ws1 = wb.create_sheet(cluster2[counter3])
for r in dataframe_to_rows(globals()[grpnm+'_finaldf'], index=False, header=True):
ws0.append(r)
if int(grpnm[-1:])%2 == 0 :
for r in dataframe_to_rows(globals()[grpnm+'_finaldf'], index=False, header=True):
ws1.append(r)
wb.save(output_path + exitname[counter3] +".xlsx")
counter3 +=1
prevalence_calculator(metadata_path,taxa_path,output_path,cluster1, cluster2)