-
Notifications
You must be signed in to change notification settings - Fork 0
/
sample_fastq.py
266 lines (205 loc) · 10.2 KB
/
sample_fastq.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
sample_fastq.py - Randomly sampling reads from fastq files
<https://github.com/miguelpmachado/randomScripts/>
Copyright (C) 2018 Miguel Machado <mpmachado@medicina.ulisboa.pt>
Last modified: January 03, 2018
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
import random
import argparse
import gzip
import bz2
import zipfile
import sys
import os
import shlex
import subprocess
from threading import Timer
version = '0.1'
def kill_subprocess_popen(subprocess_popen, command):
print('Command run out of time: ' + str(command))
subprocess_popen.kill()
def run_command_popen_communicate(command, shell_true, timeout_sec_none, print_comand_true):
run_successfully = False
if not isinstance(command, str):
command = ' '.join(command)
if print_comand_true:
print('Running: ' + command)
if shell_true:
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, shell=True)
else:
command = shlex.split(command)
proc = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
not_killed_by_timer = True
if timeout_sec_none is None:
stdout, stderr = proc.communicate()
else:
timer = Timer(timeout_sec_none, kill_subprocess_popen, args=(proc, command,))
timer.start()
stdout, stderr = proc.communicate()
timer.cancel()
not_killed_by_timer = timer.isAlive()
if proc.returncode == 0:
run_successfully = True
else:
if not print_comand_true and not_killed_by_timer:
print('Running: ' + str(command))
if len(stdout) > 0:
print('STDOUT')
print(stdout.decode("utf-8"))
if len(stderr) > 0:
print('STDERR')
print(stderr.decode("utf-8"))
return run_successfully, stdout.decode("utf-8"), stderr.decode("utf-8")
def required_length(tuple_length_options, argument_name):
class RequiredLength(argparse.Action):
def __call__(self, parser, args, values, option_string=None):
if len(values) not in tuple_length_options:
msg = 'Option {argument_name} requires one of the following number of' \
' arguments: {tuple_length_options}'.format(argument_name=argument_name,
tuple_length_options=tuple_length_options)
raise argparse.ArgumentTypeError(msg)
setattr(args, self.dest, values)
return RequiredLength
def guess_file_compression(file_path):
magic_dict = {
b'\x1f\x8b\x08': 'gz',
b'\x42\x5a\x68': 'bz2',
b'\x50\x4b\x03\x04': 'zip'
}
max_len = max(len(x) for x in magic_dict)
with open(file_path, 'rb') as f:
file_start = f.read(max_len)
for magic, file_type in magic_dict.items():
if file_start.startswith(magic):
return file_type
return None
def open_fastq_files(fastq):
copen = {
'gz': gzip.open,
'bz2': bz2.open,
'zip': zipfile.ZipFile
}
file_objects = []
for fastq_files in fastq:
ftype = guess_file_compression(fastq_files)
if ftype is not None:
file_objects.append(copen[ftype](fastq_files, mode='rt'))
else:
file_objects.append(open(fastq_files))
return file_objects
def count_reads(fastq):
read_fastq_files = open_fastq_files(fastq)
total_records = []
for fastq_file in read_fastq_files:
total_records.append(int(sum([1 for _ in fastq_file]) / 4))
fastq_file.close()
if len(total_records) > 1 and len(set(total_records)) > 1:
sys.exit('\n'.join(['Different number of reads found in the provided fastq files'] +
['{fastq_file}: {n_reads} reads'.format(fastq_file=fastq_file, n_reads=total_records[x])
for x, fastq_file in enumerate(fastq)]))
return total_records[0]
def count_reads_gzip(fastq):
total_records = []
for fastq_file in fastq:
command = ['gunzip', '--keep', '--stdout', fastq_file, '|', 'awk', "\'{if(NR%4==1) print}\'", '|', 'wc', '-l']
run_successfully, stdout, stderr = run_command_popen_communicate(command, True, None, True)
if run_successfully:
total_records.append(int(stdout.split()[0]))
if len(total_records) == 0:
sys.exit('It was not possible to determine the number of reads')
if len(total_records) > 1 and len(set(total_records)) > 1:
sys.exit('\n'.join(['Different number of reads found in the provided fastq files'] +
['{fastq_file}: {n_reads} reads'.format(fastq_file=fastq_file, n_reads=total_records[x])
for x, fastq_file in enumerate(fastq)]))
return total_records[0]
def fastq_sampling(fastq, out_fastq, total_records, number):
read_fastq_files = open_fastq_files(fastq)
write_fastq_files = [gzip.open(fastq_file, 'wt') for fastq_file in out_fastq]
output_sequence_sets = set(random.sample(range(total_records + 1), number))
for x, reader in enumerate(read_fastq_files):
print('-' * 15)
writer = write_fastq_files[x]
record_number = 0
for line in reader:
line1 = line.strip()
line2 = reader.readline().strip()
line3 = reader.readline().strip()
line4 = reader.readline().strip()
if record_number in output_sequence_sets:
writer.write(line1 + '\n')
writer.write(line2 + '\n')
writer.write(line3 + '\n')
writer.write(line4 + '\n')
record_number += 1
if record_number % 500000 == 0:
print('{file}: {percentage}%'.format(file=fastq[x], percentage=round((record_number /
float(total_records)) * 100,
2)))
reader.close()
writer.close()
print('{file}: 100%'.format(file=fastq[x]))
def main():
if sys.version_info.major != 3:
sys.exit('\n' + '"sample_fastq.py" requires Python 3' + '\n' + 'Try running with "python3 sample_fastq.py"')
parser = argparse.ArgumentParser(prog='python3 sample_fastq.py',
description='Randomly sampling reads from fastq files',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--version', help='Version information', action='version', version=str('%(prog)s v' + version))
parser_required = parser.add_argument_group('Required options')
parser_required.add_argument('-f', '--fastq', nargs='+', action=required_length((1, 2), '--fastq'),
type=argparse.FileType('r'), metavar='/path/to/input/file.fq.gz',
help='Path to single OR paired-end fastq files. If two files are passed, they will be'
' assumed as being the paired fastq files. Can be used compressed (gzip, bzip2,'
' zip) or decompressed files.', required=True)
parser_required.add_argument('-o', '--outFastq', nargs='+', action=required_length((1, 2), '--outFastq'), type=str,
metavar='/path/to/output/file_downsampled.fq.gz',
help='Path to the downsampled fastq file gzip compressed. If paired-end fastq files'
' are provided, write the path to both downsampled fastq files in the same order'
' as the input fastq files.', required=True)
downsampling_options = parser.add_mutually_exclusive_group()
downsampling_options.add_argument('--fraction', type=float, metavar='0.1',
help='Fraction of reads to sample [0, 1]', required=False)
downsampling_options.add_argument('--number', type=int, metavar='1000', help='Number of reads to sample',
required=False)
args = parser.parse_args()
if len(args.fastq) != len(args.outFastq):
parser.error('Provide the same number of files with --fastq and --outFastq')
if args.fraction is None and args.number is None:
parser.error('Provide one of the following options: --fraction or --number')
args.outFastq = [os.path.abspath(outFastq) for outFastq in args.outFastq]
errors_found = []
for outFastq in args.outFastq:
if os.path.isfile(outFastq):
errors_found.append('{} file already exist'.format(outFastq))
else:
if not os.path.isdir(os.path.dirname(outFastq)):
errors_found.append('{} directory does not exist'.format(os.path.dirname(outFastq)))
if len(errors_found) > 0:
sys.exit('\n'.join(['Found some problems with output files'] + list(set(errors_found))))
args.fastq = [os.path.abspath(fastq.name) for fastq in args.fastq]
print('\n' + 'Counting reads...')
total_records = count_reads(args.fastq)
# total_records = count_reads_gzip(args.fastq)
print('{} reads found'.format(total_records))
if args.fraction is not None:
args.number = int(total_records * args.fraction)
print('{} reads will be sampled corresponding to {}%'.format(args.number, args.fraction * 100))
if args.number >= total_records:
sys.exit('The number of reads to subsample are equal or higher than the number of reads found')
fastq_sampling(args.fastq, args.outFastq, total_records, args.number)
print('-' * 15)
print('DONE')
if __name__ == "__main__":
main()