# communityMatrixFormatter.py
# Author: Michael Shamash, McGill University
# Script to generate an abundance matrix (feature table) CSV file containing relative abundances, read counts, or binary presence/absence values
import csv
import argparse
# Command-line interface: one required input path plus two optional switches
# that choose what kind of value goes into the output matrix.
ap = argparse.ArgumentParser()
ap.add_argument("-i", required=True, help="Input CSV file location")

# Both output-mode switches are plain boolean flags, so register them together.
for flag, help_text in (
    (
        "--binary",
        "Use this flag to output a binary matrix (instead of normal relative abundance matrix), useful for calculating a Jaccard index",
    ),
    (
        "--reads",
        "Use this flag to output a read count matrix (instead of normal relative abundance matrix), useful for rarefaction curves",
    ),
):
    ap.add_argument(flag, action="store_true", help=help_text)

# Expose the parsed options as a plain dict keyed by option name
# ('i', 'binary', 'reads').
args = vars(ap.parse_args())
class Contig:
    """A single contig record: its name, sample date, abundance and read count.

    The four values are stored exactly as passed in; no type conversion or
    validation is performed here.
    """

    # Class-level fallbacks; __init__ always shadows them on the instance.
    name, date = "", ""
    abund, reads = 0.0, 0

    def __init__(self, name, date, abund, reads):
        """Store the given field values on the new instance."""
        self.name, self.date = name, date
        self.abund, self.reads = abund, reads
def make_contig(name, date, abund, reads):
    """Create and return a Contig built from the four given field values."""
    return Contig(name=name, date=date, abund=abund, reads=reads)
# Load every data row of the input CSV into `contigs` and register each
# sample (keyed by the value in the first column) in `samples`.
contigs = []
samples = {}
# newline='' is what the csv module expects for files it reads, so that
# newlines embedded in quoted fields are interpreted correctly.
with open(args['i'], newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Skip the header row; the default keeps an empty file from raising
    # StopIteration.
    next(csv_reader, None)
    for row in csv_reader:
        # csv.reader yields [] for blank lines; indexing those would crash.
        if not row:
            continue
        # Column positions used by this script: 0 = date (sample key),
        # 1 = contig name, 2 = abundance, 8 = number of reads.
        date = row[0]
        contigName = row[1]
        contigAbund = row[2]
        numberReads = row[8]
        contigs.append(make_contig(contigName, date, contigAbund, numberReads))
        # Only make sure the sample has an (initially empty) entry; the
        # per-contig values are filled in later.
        samples.setdefault(date, {})
# Count what was actually loaded, so an empty input reports 0 rather than -1.
print(f'Processed {len(contigs)} contigs.')
# Fill the per-sample value table and collect the output column names.
# `contigNames` starts with the "Sample" column, followed by each distinct
# contig name in first-seen order.
contigNames = ["Sample"]
# Companion set so the "already listed?" check is O(1) instead of a list scan.
seenNames = {"Sample"}
for contig in contigs:
    if args['binary']:
        # Presence/absence: 1 when the abundance is strictly positive.
        value = 1 if float(contig.abund) > 0 else 0
    elif args['reads']:
        value = contig.reads
    else:
        value = contig.abund
    # A later record for the same (date, name) pair overwrites an earlier one.
    samples[contig.date][contig.name] = value
    if contig.name not in seenNames:
        seenNames.add(contig.name)
        contigNames.append(contig.name)
# Write the matrix to matrix.csv: one row per sample, one column per contig.
# newline='' lets the csv writer control line endings itself; without it,
# platforms that translate "\n" on write (e.g. Windows) end up with a blank
# line after every row.
with open('matrix.csv', 'w', newline='') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=contigNames)
    writer.writeheader()
    for sample, currentSample in samples.items():
        # Contigs with no recorded value for this sample are written as 0.
        row = {contig: currentSample.get(contig, 0) for contig in contigNames}
        # The "Sample" column carries the sample identifier, not a value.
        row["Sample"] = sample
        writer.writerow(row)