-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrmlst.py
55 lines (41 loc) · 2.72 KB
/
rmlst.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
import sys
from pythonmods import runblastn, runsubprocess, mlstfilter
rmlstdbpath=sys.argv[1]
threads=sys.argv[2]
outdir=sys.argv[3]
sequenceorigin=sys.argv[4]
sourcedir=sys.argv[5]
if sequenceorigin=='ncbi':
query='%s/accessions_filtered_deduplicated.fa'%outdir
else:
query=sys.argv[6]
cmdArgs=['find %s'%rmlstdbpath+' '+'-maxdepth 1 -mindepth 1 -name "*.n??" -printf "%f\n" | rev | cut -d"." -f2- | rev | sort | uniq >'+' '+'%s/rmlstloci.txt'%rmlstdbpath]
runsubprocess(cmdArgs,shell=True)
with open('%s/rmlstloci.txt'%rmlstdbpath) as f:
for line in f:
locus=line.strip()
database='%s/%s'%(rmlstdbpath,locus)
blastoutput='%s/rmlst/BLASTtable_%s.tsv' %(outdir,locus)
runblastn(query, database, blastoutput, num_threads='%s'%threads, perc_identity='95', evalue='1e-10',word_size='28')
print('runblastn finshed for locus %s'%locus)
#combining per-locus outputs prior to filtering
args=['find %s/rmlst/ -maxdepth 1 -mindepth 1 -type f -name "BLASTtable_*.tsv" ! -name "BLASTtable_combined*.tsv" -exec cat {} \; > %s/rmlst/BLASTtable_combined.tsv'%(outdir,outdir)]
runsubprocess(args,shell=True)
#filtering
blastoutput='%s/rmlst/BLASTtable_combined.tsv'%outdir
finalfile='%s/rmlst/BLASTtablebesthits.tsv'%outdir
sortedfile='%s/rmlst/BLASTtablesorted.tsv'%outdir
mlstfilter(blastoutput, finalfile, sortedfile, sourcedir, pidthresh=95, coveragethresh=0.95, formatcontigcol=False) #95% coverage/pid are the thresholds used by Larsen et al. 2014 "Benchmarking methods for genomic taxonomy"; and a high coverage threhsold is appropriate for complete (or near complete) genomes
print('mlstfilter finished')
#OLD CODE - !PROBLEM: there are 1239737 sequences in the combined database - would need to set max_target_seqs to infeasibly large cutoff; best to search databases individually (each database comprises ~ 30000 alleles)
#idtable='%s/idtable.tsv'%sys.argv[3] #don't need id tables
# database='%s/allallelesdb'%sys.argv[3]
# idtable='%s/idtable.tsv'%sys.argv[3]
# query='%s/output/%s_assemblies.fasta' %(sys.argv[1],sys.argv[2])
# blastoutput='%s/rmlst/output/%s_BLASTtable.tsv' %(sys.argv[1],sys.argv[2])
# runblastn(query, database, blastoutput, num_threads='%s'%sys.argv[4]) #running with default evalue and word size
# print('runblastn finshed')
# finalfile='%s/rmlst/output/%s_BLASTtablebesthits.tsv'%(sys.argv[1],sys.argv[2])
# sortedfile='%s/rmlst/output/%s_BLASTtablesorted.tsv'%(sys.argv[1],sys.argv[2])
# blastfilter(blastoutput, finalfile, sortedfile, idtable, pidthresh=95, coveragethresh=0.95) #this is the threshold use in larsen 2014, and a high coverage threhsold is appropriate for complete (or near complete) genomes !? - should also apply to mlst , resfinder etc.
# print('blastfilter finished')