-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathCheck_all_loci_for_data_completeness.py
63 lines (51 loc) · 1.8 KB
/
Check_all_loci_for_data_completeness.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
import os
import pandas as pd
import sys
def read_file(filepath,file):
    """Record per-variety base-call completeness for one locus file.

    The file at ``filepath`` is read as comma-separated text.  The first
    line is skipped as a header.  For every following row, the first cell
    is taken as the variety name and each remaining cell counts as one
    position; a position is "good" when its cell is exactly ``A``, ``T``,
    ``G`` or ``C``.  The fraction good / total is written into the
    module-level DataFrame ``df`` at row ``locus`` and column ``variety``,
    and the locus name is stored in that row's ``"locus"`` column.

    Args:
        filepath: Path of the file to open.
        file: Bare file name; its extension is stripped to obtain the locus
            name used as the row label.

    Returns:
        None.  The result is the mutation of the global ``df``, which must
        exist before this function is called.
    """
    # splitext instead of file[:-4]: same result for "*.csv", but does not
    # mangle names whose extension is not exactly four characters long.
    locus = os.path.splitext(file)[0]
    valid_bases = ("A", "T", "G", "C")

    # The context manager closes the handle; this function is called once
    # per file and would otherwise leak one descriptor per call.
    with open(filepath, "r") as handle:
        next(handle, None)  # skip the header row (no-op on an empty file)
        for raw_line in handle:
            # Strip only the line terminator.  Slicing off the last
            # character unconditionally would eat a real base call when the
            # final line has no trailing newline.
            cells = raw_line.rstrip("\r\n").split(",")
            variety = cells[0]
            calls = cells[1:]
            if not calls:
                # Blank line or a row without any positions: there is
                # nothing to score, and dividing by zero would abort the
                # whole run.
                continue
            good_snps = sum(1 for cell in calls if cell in valid_bases)
            df.loc[locus, "locus"] = locus
            df.loc[locus, variety] = good_snps / len(calls)
# Usage: python Check_all_loci_for_data_completeness.py <folder> <output_csv>
#
# Walks <folder> recursively, passes every file found to read_file(), and
# writes the accumulated module-level DataFrame ``df`` to <output_csv>.
if len(sys.argv) < 3:
    sys.exit("Usage: {} <folder> <output_csv>".format(sys.argv[0]))

file_counter = 0
folder = sys.argv[1]
output_file = sys.argv[2]

# ``df`` is the global that read_file() fills in.  It is created from the
# first file encountered; None means no file has been seen yet.
df = None

for root, dirs, files in os.walk(folder):
    for file in files:
        filepath = os.path.join(root, file)

        # First file: build the data frame, one column per variety listed
        # in that file (first cell of every non-header row).
        if df is None:
            all_varieties = ["locus"]
            with open(filepath, "r") as f_first:
                next(f_first, None)  # skip the header row
                for line in f_first:
                    # Strip only the terminator so a final line without a
                    # newline does not lose its last character.
                    line = line.rstrip("\r\n")
                    if not line:
                        # A blank line would otherwise add an empty-named
                        # column.
                        continue
                    all_varieties.append(line.split(",")[0])
            # "locus" stays an ordinary column (not the index): read_file()
            # writes to it explicitly and the CSV is saved with index=False.
            df = pd.DataFrame(columns=all_varieties)

        file_counter += 1
        if file_counter % 50 == 0:
            print("file count", file_counter)
        read_file(filepath, file)

if df is None:
    # Nothing was found, so there is no frame to write; say so instead of
    # failing later with a NameError.
    sys.exit("No files found under {!r}; nothing written".format(folder))

df.to_csv(output_file, index=False)