host_depletion.py
import numpy as np
import random
import os
import sys, getopt
import torch
import torch.nn as nn
torch.backends.cudnn.enabled = False
from helper import *
from gpu_usage import *
from constants import *
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
# sets up the environment such that AMAISE uses one GPU. Change this to increase the number of
# GPUs that AMAISE uses
os.environ["CUDA_VISIBLE_DEVICES"]="0"
os.environ["OMP_NUM_THREADS"] = "16"
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":16:8"
torch.set_num_threads(16)
# ensure reproducibility
seed = 0
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
torch.cuda.manual_seed(seed)
np.random.seed(seed)
random.seed(seed)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
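# (cudnn.deterministic forces deterministic convolution kernels and benchmark=False
# disables cuDNN autotuning, so repeated runs give identical results at some cost in speed.)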
# Parse command-line arguments; print usage information to help the user run AMAISE
try:
    opts, args = getopt.getopt(sys.argv[1:], 'hi:t:o:', ['inputfile=', 'typefile=', 'outfolder='])
except getopt.GetoptError:
    print('host_depletion.py -i <inputfile> -t <typefile> -o <outfolder>')
    sys.exit(2)
if len(opts) != 3:
    print('host_depletion.py -i <inputfile> -t <typefile> -o <outfolder>')
    sys.exit()
'''
User Inputs:
inputfile: fasta/fastq file to classify
typefile: 'fastq' if inputfile is a fastq file, and 'fasta' if inputfile is a fasta file
outfolder: folder to write output files to
'''
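# A minimal example invocation (the file and folder names here are hypothetical):
#   python host_depletion.py -i reads.fastq -t fastq -o results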
for opt, arg in opts:
    if opt == '-h':
        print('host_depletion.py -i <inputfile> -t <typefile> -o <outfolder>')
        sys.exit()
    elif opt in ("-i", "--inputfile"):
        testfile = arg
    elif opt in ("-t", "--typefile"):
        typefile = arg
        if typefile not in ['fastq', 'fasta']:
            print('-t argument: fastq, fasta')
            sys.exit(2)
    elif opt in ("-o", "--outfolder"):
        outfolder = arg
# Create temp_lenfiles folder
if not os.path.exists('temp_lenfiles'):
    os.makedirs('temp_lenfiles')
# Create outfolder folder
if not os.path.exists(outfolder):
    os.makedirs(outfolder)
# Delete existing information in temp_lenfiles
os.system('rm -f temp_lenfiles/*')
# Delete existing output of AMAISE
os.system('rm -f %s/mlpaths.%s'%(outfolder, typefile))
# Open file to write probabilities that each input sequence is from a host
outputwritefile = open("%s/mlprobs.txt"%(outfolder), 'w')
outputwritefile.write('id, classification label (0 for microbe, 1 for host), length\n')
# Open file to write microbial sequences to
g = open("%s/mlpaths.%s"%(outfolder, typefile), "w")
# If we are tracking GPU usage, open file to track GPU information
if track_gpu:
    gpufile = open('%s/gpuusage.txt' % outfolder, 'w')
# Load AMAISE onto GPUs
model = TCN()
model = nn.DataParallel(model)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)
model.load_state_dict(torch.load(modelpath, map_location=device))
model.eval()
# Get length information to bin input sequences by length
orig_filelens, all_filelens = create_filelens(typefile, testfile)
# Iterate through all bins
for filelens in all_filelens:
    # Delete text files in temp_lenfiles
    os.system('rm -f temp_lenfiles/*')
    # Initialize final_size, which keeps track of the number of sequences in each
    # length bin that have been classified
    final_size = [0] * len(filelens)
    # Write sequences that are in the current length bins to "temp_lenfiles"
    sort_seqs(orig_filelens, typefile, testfile, filelens, final_size)
    # Iterate through the text files in "temp_lenfiles" and classify the sequences
    with torch.no_grad():
        for len_idx, len_ in enumerate(filelens):
            i = 0
            j = 0
            ids, seqs, lens, y_pred_oh, quals = [], [], [], [], []
            final_len = len_
            # Truncate all sequence lengths to "max_len"
            if final_len >= max_len:
                final_len = max_len
            # Calculate batch size for testing
            batch_size = int(batch_max_len / final_len)
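            # For example, if batch_max_len were 100000 and final_len were 500 (illustrative
            # values only; batch_max_len itself is defined in the starred imports above),
            # each batch would hold 100000 / 500 = 200 sequences.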
            temp_batch_size = batch_size
            if os.path.exists('temp_lenfiles/len%d.txt' % len_):
                with open('temp_lenfiles/len%d.txt' % len_) as f:
                    for line in f:
                        # Classify the current batch once it is full
                        if j == temp_batch_size:
                            X = torch.tensor(X).float().to(device)
                            with torch.cuda.amp.autocast():
                                y_pred_oh = torch.sigmoid(model(X)).detach().cpu().numpy()[:, 1]
                            if track_gpu:
                                gpu_usage(gpufile)
                            del X
                            torch.cuda.empty_cache()
                            if typefile == 'fasta':
                                quals = [0] * len(seqs)
                            # Write classifications/microbial sequences to the appropriate files.
                            # The loop variable is seqlen_ rather than len_ so that it does not
                            # clobber len_, which is still needed below to delete this bin's file.
                            for id_, pred_, seq_, seqlen_, qual_ in zip(ids, y_pred_oh, seqs, lens, quals):
                                write_output(typefile, outputwritefile, g, id_, seq_, seqlen_, pred_, qual_)
                            ids, seqs, y_pred_oh, lens, quals = [], [], [], [], []
                        # Initialize current batch
                        if i % batch_size == 0:
                            j = 0
                            temp_batch_size = batch_size
                            if i + batch_size > final_size[len_idx]:
                                temp_batch_size = final_size[len_idx] - i
                            # 4 channels per position (presumably a one-hot A/C/G/T encoding)
                            X = np.zeros((temp_batch_size, final_len, 4))
                        # Read in information from text files and write it to appropriate variables
                        id_ = line.split(', ')[0]
                        seq_ = line.split(', ')[1]
                        ids.append(id_)
                        seqs.append(seq_)
                        lens.append(final_len)
                        if typefile == 'fastq':
                            qual_ = line.split(', ')[2].split('\n')[0]
                            quals.append(qual_)
                        # Update array used to store sequence information
                        X[j, :, :] = np.asarray(generate_long_sequences(seq_[:final_len]), dtype='uint8')
                        i += 1
                        j += 1
            # Classify sequences in final batch
            if i > 0 or j > 0:
                X = torch.tensor(X).float().to(device)
                with torch.cuda.amp.autocast():
                    y_pred_oh = torch.sigmoid(model(X)).detach().cpu().numpy()[:, 1]
                if track_gpu:
                    gpu_usage(gpufile)
                del X
                torch.cuda.empty_cache()
                if typefile == 'fasta':
                    quals = [0] * len(seqs)
                # Write classifications/microbial sequences to the appropriate files
                # (again using seqlen_ so that len_ is preserved for the cleanup below)
                for id_, pred_, seq_, seqlen_, qual_ in zip(ids, y_pred_oh, seqs, lens, quals):
                    write_output(typefile, outputwritefile, g, id_, seq_, seqlen_, pred_, qual_)
                ids, seqs, lens, y_pred_oh, quals = [], [], [], [], []
            # Delete the text file we just iterated through in temp_lenfiles
            if os.path.exists('temp_lenfiles/len%d.txt' % len_):
                os.system('rm -f temp_lenfiles/len%d.txt' % len_)
# Close all opened files
outputwritefile.close()
g.close()
if track_gpu:
    gpufile.close()
# Delete existing information in temp_lenfiles
os.system('rm -f temp_lenfiles/*')