-
Notifications
You must be signed in to change notification settings - Fork 2
/
Gap
executable file
·277 lines (242 loc) · 10.7 KB
/
Gap
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
#!/usr/bin/env python3
import sys, time, argparse, subprocess, os.path, shutil, struct
Description = """Tool to build/merge the BWT and LCP array for a collection of sequences.
The tool can also compute the DA (document array)
There are two different usages depending on whether you already have the
BWT/LCP of the input files.
# Merge mode
If you already have all the BWTs/LCPs you must specify at least two input
basenames and use the option -o to specify the output basename.
For example
{exe} -o ABmerge fileA fileB
will merge the files fileA.bwt/fileA.2.lcp and fileB.bwt/fileB.2.lcp and
produce the output files: ABmerge.bwt, ABmerge.2.lcp. Use the option -l
to merge also the LCP arrays. For example
{exe} -l -o ABmerge fileA fileB
then the files fileA.2.lcp fileB.2.lcp must also be present and the output
will consists of the two files ABmerge.bwt and ABmerge.2.lcp
Globbing is accepted: multiple input file names can be denoted for example
as file?
# Compute from scratch mode
If you don't have the BWTs then your input must consists of a single file
with extension
.fasta/.fa (one input document per sequence)
.fastq (one input document per sequence)
.txt (one input document per line)
and it is not mandatory to specify the output basename. For example:
{exe} -l file.fasta
this will produce the output files: file.fasta.bwt, file.fasta.2.lcp
# Common Options
The option --lbytes specifies the number of bytes used for each LCP entry;
(1, 2, or 4). Note from the examples above that this number is always
part of the lcp file names: for example if option "--lbytes 4" is used
then all input and output LCP files must use 4 bytes per entry and have
the extension ".4.lcp". LCP values are stored in little endian format.
If option -d is used then also the Document Array is computed
and written to file with extension .da
All input and output files are uncompressed!
--------------------------
Command line options:
--------------------------
""".format(exe=sys.argv[0])
gsacak_exe = "tools/gsacak"
gsacak64_exe = "tools/gsacak-64"
gap_exe = "gap"
shasum_exe = "sha1sum"
def main():
parser = argparse.ArgumentParser(description=Description, formatter_class=argparse.RawTextHelpFormatter)
parser.add_argument('input', help='input file name(s)', type=str, nargs='+')
parser.add_argument('-o', '--out', help='output base name (def. input base name)', default="", type=str)
parser.add_argument('-l', '--lcp', help='compute LCP Array',action='store_true')
parser.add_argument('-d', '--da', help='compute Document Array',action='store_true')
parser.add_argument('--lbytes', help='bytes x LCP entry (def. 2)', default=2, type=int)
parser.add_argument('--sum', help='compute output files shasum',action='store_true')
parser.add_argument('--delete', help='delete output files (only with --sum)',action='store_true')
parser.add_argument('-1', '--phase1', help='stop after phase 1 (debug only)',action='store_true')
parser.add_argument('-v', help='verbose: extra info in the log file',action='store_true')
args = parser.parse_args()
# ---- check number of input files and define basename
check_input(args)
# ---- create and open log file
logfile_name = args.basename + ".Gap.log"
# get main Gap directory
args.egap_dir = os.path.split(sys.argv[0])[0]
print("Sending logging messages to file:", logfile_name)
with open(logfile_name,"a") as logfile:
print(">>> Begin computation",file=logfile)
show_command_line(logfile)
logfile.flush()
# ---- phase1: concat/compute BWTs
start0 = start = time.time()
if phase1(args,logfile,logfile_name)!=True:
sys.exit(1) # fatal error during phase 1
print("Elapsed time: {0:.4f}".format(time.time()-start))
if args.phase1:
print("Exiting after phase 1 as requested")
return
# ---- phase2: merging of BWTs and computation of LCP and DA arrays
start = time.time()
if phase2(args,logfile,logfile_name)!=True:
sys.exit(1) # fatal error during phase 2
print("Elapsed time: {0:.4f}".format(time.time()-start));
try:
os.remove(args.basename +".size") # delete size file no longer useful
except OSError as e: # if failed, report it back to the user and stop
print ("Error: %s - %s." % (e.filename,e.strerror))
sys.exit(1)
# ---- final report
elapsed = time.time()-start0
outsize = os.path.getsize(args.basename+".bwt")
musecbyte = elapsed*10**6/(outsize)
print("==== Done")
print("Total construction time: {0:.4f} usec/byte: {1:.4f}".format(elapsed,musecbyte))
# -------- compute hash sums using shasum_exe
if args.sum :
digest = file_digest(args.basename +".bwt",logfile)
print("BWT {exe}: {digest}".format(exe=shasum_exe, digest=digest))
if args.lcp:
digest = file_digest(args.basename+args.lcpext,logfile)
print("LCP {exe}: {digest}".format(exe=shasum_exe, digest=digest))
if args.da:
digest = file_digest(args.basename+".da",logfile)
print("DA {exe}: {digest}".format(exe=shasum_exe, digest=digest))
# -------- delete output files if required
if (args.sum and args.delete):
try:
os.remove(args.basename+".bwt")
if args.lcp:
os.remove(args.basename+args.lcpext)
if args.da:
os.remove(args.basename+".da")
except OSError as e:
# if failed, report it back to the user and stop
print ("Error: %s - %s." % (e.filename,e.strerror))
print(">>> End test", file=logfile);
return
# compute hash digest for a file
def file_digest(name,logfile):
try:
hash_command = "{exe} {infile}".format(exe=shasum_exe, infile=name)
hashsum = subprocess.check_output(hash_command.split(),stderr=logfile)
hashsum = hashsum.decode("utf-8").split()[0]
except:
hashsum = "Error!"
return hashsum
# check that all required files for merge mode exist
# return True if at least one file is missing
def files_bwt_lcp_missing(args):
missing = False
for name in args.input:
if not os.path.exists (name+".bwt"):
print(" Missing:", name + ".bwt")
missing = True
if args.lcp and not os.path.exists(name+args.lcpext):
print(" Missing:", name +args.lcpext)
missing = True
return missing
# check correctness of number of input file and define basename for output
def check_input(args):
# tests common to the two operation modes
if args.delete and not args.sum:
print("Option --delete can only be used with --sum")
sys.exit(1)
if args.lbytes!=1 and args.lbytes!=2 and args.lbytes!=4:
print("The number of bytes for LCP entry must be 1, 2 or 4")
sys.exit(1)
args.lcpext = ".{N}.lcp".format(N=args.lbytes)
# ---- if the inputs are bwts/lcps there must be at least 2 of them
if len(args.input)>1:
if len(args.out)==0:
print("Please use option -o to specify an output basename!")
sys.exit(1)
args.basename = args.out
# check existence of all input files
if files_bwt_lcp_missing(args):
print("The above input files are missing.")
print("Makes sure all input files exist and have the correct extension")
sys.exit(1)
# check file sizes
if args.lcp:
for name in args.input:
size = os.path.getsize(name+".bwt")
sizelcp = os.path.getsize(name+args.lcpext)
if(size*args.lbytes!=sizelcp):
print("BWT/LCP file sizes mismatch for input:",name)
sys.exit(1)
# ---- if the input are concatenated texts there is a single file
else:
if len(args.input)!=1:
print("You must supply a single file containing the concatenation of the input texts!")
sys.exit(1)
if len(args.out)==0: # specify basename for input files gap+merge
args.basename = args.input[0]
else:
args.basename = args.out
# phase1:
# concatenation or computation of bwts/lcps
def phase1(args,logfile, logfile_name):
if len(args.input)>1: # merge mode
print("==== creating .size file")
with open(args.basename+ ".size","wb") as sizefile:
for name in args.input:
size = os.path.getsize(name+".bwt")
sizefile.write(struct.pack('<Q',size))
print("==== concatenating BWT files")
with open(args.basename + ".bwt","wb") as bwtfile:
for name in args.input:
with open(name+".bwt",'rb') as fd:
shutil.copyfileobj(fd, bwtfile, 1024*1024*10) # 10 MBs buffer
if args.lcp:
print("==== concatenating LCP files")
with open(args.basename + args.lcpext,"wb") as lcpfile:
for name in args.input:
with open(name+args.lcpext,'rb') as fd:
shutil.copyfileobj(fd, lcpfile, 1024*1024*10) # 10 MBs buffer
return True # everything fine
else:
# ---- use gSACAK with unlimited memory
# shall we use gsaka or gsaka64?
size = os.path.getsize(args.input[0])
if(size<2**31): # note: the actual input could be smaller than this because of the fasta/etc information
exe = os.path.join(args.egap_dir,gsacak_exe)
else: # if in doubt use 64bit version
exe = os.path.join(args.egap_dir,gsacak64_exe)
# specify output base name
if len(args.out)==0: outopt=""
else: outopt="-o " + args.basename
if(args.lcp): options = "-g {lbytes}".format(lbytes = args.lbytes) # generate lcp
else: options = ""
command = "{exe} -b {options} {output} {ifile} 0".format(exe=exe,
options=options, output=outopt, ifile=args.input[0])
# execute choosen algorithm
print("==== gSACAK\n Command:", command)
return execute_command(command,logfile,logfile_name)
# phase2:
# merging of BWTs and computation of LCP and DA arrays
def phase2(args,logfile, logfile_name):
exe = os.path.join(args.egap_dir,gap_exe)
options = "-g256 -vaBT"
if(args.lcp): options += "r" # generate lcp
if(args.da): options += "d" # generate da
if(args.v): options += "v" # increase verbosity level
command = "{exe}{byts} {opts} {ibase}".format(exe=exe,
byts = args.lbytes, opts=options, ibase=args.basename)
print("==== gap\n Command:", command)
return execute_command(command,logfile,logfile_name)
# execute command: return True is everything OK, False otherwise
def execute_command(command,logfile,logfile_name):
try:
subprocess.check_call(command.split(),stdout=logfile,stderr=logfile)
except subprocess.CalledProcessError:
print("Error executing command line:")
print("\t"+ command)
print("Check log file: " + logfile_name)
return False
return True
def show_command_line(f):
f.write("Python command line: ")
for x in sys.argv:
f.write(x+" ")
f.write("\n")
if __name__ == '__main__':
main()