-
Notifications
You must be signed in to change notification settings - Fork 5
/
filter_fastq.py
executable file
·164 lines (133 loc) · 5.65 KB
/
filter_fastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
#!/usr/bin/env python
# filter_fastq.py
# filter sequence ids by identifiers in a file.
# SNF March 2016
import sys, argparse, re, gzip, os, threading, time
from itertools import islice, izip
import multiprocessing
parser = argparse.ArgumentParser(description="Filter fastq file")
parser.add_argument("-i", "--input", help="File to filter unpaired reads from (default stdin)", default="stdin")
parser.add_argument("-1", "--read1", help="Paired-end fastq file one")
parser.add_argument("-2", "--read2", help="Paired-end fastq file two")
parser.add_argument("-p", "--num-threads", help="Number of threads to start (currently for PE reads only)", type=int)
parser.add_argument("-o", "--output", help="Output filename (default stdout; interpreted as a basename for PE reads)", default="stdout")
parser.add_argument("-f", "--filter_file", help="File containing strings to include based on IDs in fastq")
parser.add_argument("-v", "--invert", help="Invert match (exclude matching entries)", action="store_true")
parser.add_argument("--gzip", help="gzip compress the output", action="store_true")
args = parser.parse_args()
if args.input != "stdin" and (args.read1 or args.read2):
sys.exit("Fatal: please provide only one of --input or --read1/--read2 but not both")
if (args.read1 and not args.read2) or (args.read2 and not args.read1):
sys.exit("Fatal: both --read1 and --read2 must be provided when processing paired-end reads")
if args.read1: # reads are paired now if this is true
paired = True
else:
paired = False
if not paired:
if (args.output == "stdout" or args.output == "-"):
outfile = sys.stdout
else:
if args.gzip:
outfile = gzip.open(args.output, "wb")
else:
outfile = open(args.output, "w")
if (args.input == "stdin" or args.input == "-"):
infile = sys.stdin
else:
filename, file_extension = os.path.splitext(args.input)
if file_extension == ".gz":
infile = gzip.open(args.input, "rb")
elif file_extension == ".fastq":
infile = open(args.input, "r")
else:
sys.exit("Fatal: unknown file extension for filename %s" % args.input)
else: # paired
filename, file_extension = os.path.splitext(args.read1)
if file_extension == ".gz":
infile_read1 = gzip.open(args.read1, "rb")
elif file_extension == ".fastq":
infile_read1 = open(args.read1, "r")
else:
sys.exit("Fatal: unknown file extension for filename %s" % args.read1)
filename, file_extension = os.path.splitext(args.read2)
if file_extension == ".gz":
infile_read2 = gzip.open(args.read2, "rb")
elif file_extension == ".fastq":
infile_read2 = open(args.read2, "r")
else:
sys.exit("Fatal: unknown file extension for filename %s" % args.read2)
if args.gzip:
outfile_read1 = gzip.open(args.output + ".1.fastq.gz", "wb")
outfile_read2 = gzip.open(args.output + ".2.fastq.gz", "wb")
else:
outfile_read1 = open(args.output + ".1.fastq", "w")
outfile_read2 = open(args.output + ".2.fastq", "w")
sys.stderr.write("Filtering reads from paired files %s and %s.\n" % (args.read1, args.read2))
filter_list = [i.strip() for i in open(args.filter_file)]
sys.stderr.write("Read %d identifiers to filter.\n" % len(filter_list))
# note this function uses filter_list as defined globally earlier. can't pass it in b/c need one argument for pool.imap
def filter_pair(lines):
# lines is a set of four lines from both files that are stitched together. Order is:
# file1line1, file2line1, file1line2, file2line2, etc...
seqid1 = lines[0].split()[0]
seqid2 = lines[1].split()[0]
if seqid1 != seqid2:
print "WARNING: out of sync pairs detected with ids %s %s" % (seqid1, seqid2)
if args.invert:
if seqid1 in filter_list:
#print "Filtering ID r1 %s" % seqid1
return False
else:
return lines
else:
if seqid1 in filter_list:
#print "Keeping ID r1 %s" % seqid1
return lines
else:
return False
def filter_read(lines):
# lines is a set of four lines from the file.
seqid = lines[0].split()[0]
if args.invert:
if seqid in filter_list:
#print "Filtering ID r1 %s" % seqid1
return False
else:
return lines
else:
if seqid in filter_list:
#print "Keeping ID r1 %s" % seqid1
return lines
else:
return False
n_reads = 0
n_filtered = 0
pool = multiprocessing.Pool(processes=args.num_threads)
if paired:
# the izip below makes an iterator that loops over both files in sets of four. should be parallel-safe
res = pool.imap(filter_pair, izip(*[infile_read1, infile_read2]*4))
else:
# one-file iterator
res = pool.imap(filter_read, izip(*[infile]*4))
while 1:
if not n_reads % 1000000:
sys.stderr.write("Processed %d reads.\n" % n_reads)
try:
lines = res.next()
n_reads += 1
if lines:
if paired:
outfile_read1.write(lines[0] + lines[2] + lines[4] + lines[6])
outfile_read2.write(lines[1] + lines[3] + lines[5] + lines[7])
else:
outfile.write("".join(lines))
if not args.invert:
n_filtered += 1
else: # don't output this
if args.invert:
n_filtered += 1
except StopIteration:
break
pool.close()
pool.join()
sys.stderr.write("Processed %d reads, filtered %d reads. (%.2f%%).\n" % (n_reads, n_filtered, n_filtered/float(n_reads)))