Skip to content
This repository was archived by the owner on Nov 9, 2023. It is now read-only.

Commit 53ff511

Browse files
author
Greg Caporaso
committed
Merge pull request #1806 from jairideout/all_multiple_file_command_generators (Nov 13, 2014)
Batch mode scripts
2 parents 44138d6 + f15338d commit 53ff511

39 files changed

+1468
-0
lines changed

ChangeLog.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -99,6 +99,7 @@ optionally a mapping file. Check out the new documentation for the naming conven
9999
* Removed ``make_otu_heatmap_html.py`` in favor of ``make_otu_heatmap.py`` (see discussion on [#1724](https://github.com/biocore/qiime/issues/1724)).
100100
* Fixed bug that resulted in samples being mislabeled in ``make_otu_heatmap.py`` when one of the following options was passed: ``--category``, ``--map_fname``, ``--sample_tree``, or ``--suppress_column_clustering``. This is discussed in [#1790](https://github.com/biocore/qiime/issues/1790).
101101
* Added ``--negate_sample_id_fp`` option to ``filter_samples_from_otu_table.py`` (see [#1117](https://github.com/biocore/qiime/issues/1117)).
102+
* Added three new workflow scripts for facilitating initial QIIME processing of already-demultiplexed fastq files, as these are commonly being provided by sequencing centers. These are: ``multiple_split_libraries_fastq.py``, ``multiple_join_paired_ends.py``, and ``multiple_extract_barcodes.py``.
102103

103104
QIIME 1.8.0 (11 Dec 2013)
104105
=========================

qiime/workflow/preprocess.py

Lines changed: 286 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
from __future__ import division
2+
3+
__author__ = "William Walters"
4+
__copyright__ = "Copyright 2011, The QIIME Project"
5+
__credits__ = ["William Walters"]
6+
__license__ = "GPL"
7+
__version__ = "1.8.0-dev"
8+
__maintainer__ = "William Walters"
9+
__email__ = "William.A.Walters@colorado.edu"
10+
11+
from os.path import join, basename, splitext
12+
13+
def create_commands_jpe(pairs, base_output_dir, optional_params="",
                        leading_text="", trailing_text="",
                        include_input_dir_path=False,
                        remove_filepath_in_name=False, match_barcodes=False,
                        bc_pairs=None):
    """ Creates commands for join_paired_ends.py

    pairs: dictionary of forward:reverse read filepaths
    base_output_dir: base directory; each command writes to a subdirectory
     of it named from the forward read filepath
    optional_params: added parameters to join_paired_ends.py calls
    leading_text: Text to add before join_paired_ends.py call
    trailing_text: Text to add after join_paired_ends.py call
    include_input_dir_path: If True, include input directory in output
     directory names
    remove_filepath_in_name: If True, the base filename will not be used in the
     output directory names.
    match_barcodes: True to match barcodes.
    bc_pairs: dictionary of read1:bc_read filepaths (None or empty if not
     used)

    Returns a list with one entry per forward read; each entry is a
    single-element list holding a (description, command) tuple.

    Raises KeyError if match_barcodes is True and a forward read is missing
    from bc_pairs.
    """
    # None instead of a dict literal: a mutable default would be shared
    # between calls.
    if bc_pairs is None:
        bc_pairs = {}

    commands = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']
    command_prefix = _clean_leading_text(leading_text)

    for curr_fp in pairs:
        added_output_str = ""
        if include_input_dir_path:
            # A bare filename has no parent directory component to include.
            path_parts = curr_fp.split('/')
            if len(path_parts) > 1:
                added_output_str = path_parts[-2]

        if not remove_filepath_in_name:
            curr_name = basename(curr_fp)
            # Reset for every file so that a name without a known extension
            # can neither reuse the previous file's extension nor hit an
            # unbound variable. The last matching entry wins, so a
            # '.fastq.gz' name is cut at '.fastq', which gives the same stem.
            curr_ext = None
            for extension in extensions:
                if extension in curr_name:
                    curr_ext = extension
            if curr_ext is not None:
                curr_name = curr_name.split(curr_ext)[0]
            added_output_str += curr_name

        curr_outputdir = join(base_output_dir, added_output_str)
        if match_barcodes:
            command = "%sjoin_paired_ends.py %s -b %s -f %s -r %s -o %s %s" % (
                command_prefix, optional_params, bc_pairs[curr_fp], curr_fp,
                pairs[curr_fp], curr_outputdir, trailing_text)
        else:
            command = "%sjoin_paired_ends.py %s -f %s -r %s -o %s %s" % (
                command_prefix, optional_params, curr_fp, pairs[curr_fp],
                curr_outputdir, trailing_text)

        commands.append([('join_paired_ends.py: %s' % curr_fp, command)])

    return commands
60+
61+
def create_commands_eb(all_files, ispaired, base_output_dir,
                       optional_params="", leading_text="", trailing_text="",
                       include_input_dir_path=False,
                       remove_filepath_in_name=False):
    """ Creates commands for extract_barcodes.py

    all_files: list of input filepaths, or dict of forward:reverse read
     filepaths when ispaired is True
    ispaired: True if paired data
    base_output_dir: base directory; each command writes to a subdirectory
     of it named from the input filepath
    optional_params: added parameters to extract_barcodes.py calls
    leading_text: Text to add before extract_barcodes.py call
    trailing_text: Text to add after extract_barcodes.py call
    include_input_dir_path: If True, include input directory in output
     directory names
    remove_filepath_in_name: If True, the base filename will not be used in the
     output directory names.

    Returns a list with one entry per input file; each entry is a
    single-element list holding a (description, command) tuple.
    """
    commands = []
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']
    command_prefix = _clean_leading_text(leading_text)

    for curr_fp in all_files:
        added_output_str = ""
        if include_input_dir_path:
            # A bare filename has no parent directory component to include.
            path_parts = curr_fp.split('/')
            if len(path_parts) > 1:
                added_output_str = path_parts[-2]

        if not remove_filepath_in_name:
            curr_name = basename(curr_fp)
            # Reset for every file so that a name without a known extension
            # can neither reuse the previous file's extension nor hit an
            # unbound variable. The last matching entry wins, so a
            # '.fastq.gz' name is cut at '.fastq', which gives the same stem.
            curr_ext = None
            for extension in extensions:
                if extension in curr_name:
                    curr_ext = extension
            if curr_ext is not None:
                curr_name = curr_name.split(curr_ext)[0]
            added_output_str += curr_name

        curr_outputdir = join(base_output_dir, added_output_str)
        if ispaired:
            # For paired data all_files maps each forward read to its
            # reverse read.
            command = "%sextract_barcodes.py %s -f %s -r %s -o %s %s" % (
                command_prefix, optional_params, curr_fp, all_files[curr_fp],
                curr_outputdir, trailing_text)
        else:
            command = "%sextract_barcodes.py %s -f %s -o %s %s" % (
                command_prefix, optional_params, curr_fp,
                curr_outputdir, trailing_text)

        commands.append([('extract_barcodes.py: %s' % curr_fp, command)])

    return commands
105+
106+
def create_commands_slf(all_files, demultiplexing_method, output_dir,
                        params="", leading_text="", trailing_text="",
                        include_input_dir_path=False,
                        remove_filepath_in_name=False,
                        sampleid_indicator="_"):
    """ Creates command for split_libraries_fastq.py

    all_files: list of input filepaths or dict of reads:(barcode,mapping)
    demultiplexing_method: Either 'sampleid_by_file' or 'mapping_barcode_files'
    output_dir: output directory to write split_libraries_fastq output
    params: added parameters to split_libraries_fastq.py calls
    leading_text: Text to add before split_libraries_fastq.py call
    trailing_text: Text to add after split_libraries_fastq.py call
    include_input_dir_path: If True, include input directory in output
     SampleID names
    remove_filepath_in_name: If True, the base filename will not be used in the
     output SampleID names.
    sampleid_indicator: Split on this character in input fastq filenames to
     generate output SampleID name.

    Returns a list containing one single-element list, which holds the
    (description, command) tuple for the single generated command.
    """
    commands = []
    read_files = []
    barcode_files = []
    mapping_files = []
    sample_ids = []

    by_file = demultiplexing_method == 'sampleid_by_file'

    # Sort the de-duplicated filepaths: iteration order of a plain set is
    # arbitrary, whereas the generated command (and the unit tests checking
    # it) need a reproducible order.
    all_fps = sorted(set(all_files))

    for curr_fp in all_fps:
        read_files.append(curr_fp)
        if by_file:
            # Just need to build up a list of SampleID names
            sample_id = ""
            if include_input_dir_path:
                # A bare filename has no parent directory to include.
                path_parts = curr_fp.split('/')
                if len(path_parts) > 1:
                    sample_id = path_parts[-2]
            if not remove_filepath_in_name:
                sample_id += basename(curr_fp).split(sampleid_indicator)[0]
            sample_ids.append(sample_id)
        else:
            # Need list of barcode filepaths, mapping filepaths
            barcode_files.append(all_files[curr_fp][0])
            mapping_files.append(all_files[curr_fp][1])

    if by_file:
        command = (
            "%ssplit_libraries_fastq.py %s -i %s --sample_ids %s -o %s %s "
            "--barcode_type 'not-barcoded'" % (
                _clean_leading_text(leading_text), params,
                ",".join(read_files), ",".join(sample_ids),
                output_dir, trailing_text))
    else:
        command = (
            "%ssplit_libraries_fastq.py %s -i %s --barcode_read_fps %s "
            "--mapping_fps %s -o %s %s" % (
                _clean_leading_text(leading_text), params,
                ",".join(read_files), ",".join(barcode_files),
                ",".join(mapping_files), output_dir, trailing_text))

    commands.append([('split_libraries_fastq.py', command)])

    return commands
166+
167+
def get_pairs(all_files, read1_indicator, read2_indicator, match_barcodes=False,
              barcode_indicator="_I1_"):
    """ Finds pairs of files from a list of files, optionally matches barcodes

    all_files: list of filepaths
    read1_indicator: string indicating read 1 of a pair
    read2_indicator: string indicating read 2 of a pair
    match_barcodes: If True, will attempt to match up barcodes file
    barcode_indicator: string indicating barcode file.

    Files are matched when the text before and after their indicator is
    identical. Read 1 or read 2 files without a partner are left out of the
    result.

    Returns a (pairs, bc_pairs) tuple: pairs maps each read 1 filepath to its
    read 2 filepath, bc_pairs maps each read 1 filepath to its barcode
    filepath (empty unless match_barcodes is True).

    Raises ValueError if a filepath does not contain exactly one occurrence of
    a usable indicator, or if match_barcodes is True and a paired forward read
    has no matching barcode file.
    """
    pairs = {}
    bc_pairs = {}

    # Each entry is the (prefix, suffix) tuple found around the indicator.
    read1_files = []
    read2_files = []
    bc_files = []

    for curr_file in all_files:
        curr_file_string_r1 = curr_file.split(read1_indicator)
        curr_file_string_r2 = curr_file.split(read2_indicator)
        if match_barcodes:
            curr_file_string_bc = curr_file.split(barcode_indicator)
        else:
            curr_file_string_bc = []

        # Exactly two pieces means the indicator occurs exactly once.
        if len(curr_file_string_r1) == 2:
            read1_files.append(tuple(curr_file_string_r1))
        elif len(curr_file_string_r2) == 2:
            read2_files.append(tuple(curr_file_string_r2))
        elif match_barcodes and len(curr_file_string_bc) == 2:
            bc_files.append(tuple(curr_file_string_bc))
        else:
            raise ValueError(
                "Invalid filename found for splitting on input "
                "for file %s, " % curr_file + "check input read1_indicator "
                "and read2_indicator parameters as well.")

    # Set lookups replace a nested scan over every read1/read2 combination.
    read2_lookup = set(read2_files)
    for curr_read1 in read1_files:
        if curr_read1 in read2_lookup:
            pairs[read1_indicator.join(curr_read1)] = \
                read2_indicator.join(curr_read1)

    if match_barcodes:
        bc_lookup = set(bc_files)
        for curr_read1 in read1_files:
            if curr_read1 in bc_lookup:
                bc_pairs[read1_indicator.join(curr_read1)] = \
                    barcode_indicator.join(curr_read1)
        # Every forward read that was paired must also have a barcodes file.
        forward_reads = set(pairs.keys())
        bc_reads = set(bc_pairs.keys())
        non_matching_f_reads = forward_reads - bc_reads
        if non_matching_f_reads:
            raise ValueError("Found forward reads without matching barcodes "
                             "file: %s" % non_matching_f_reads)

    return pairs, bc_pairs
224+
225+
def get_matching_files(all_fastq, all_mapping,
                       read_indicator, barcode_indicator, mapping_indicator):
    """ Matches up read, barcode, and mapping files based on filenames

    all_fastq: list of sequence filepaths
    all_mapping: list of mapping filepaths
    read_indicator: string indicating read file
    barcode_indicator: string indicating barcode file
    mapping_indicator: string indicating mapping file

    Returns a dictionary of read filepath:(barcode filepath, mapping
    filepath). Read files without a matching barcode file are left out.

    Raises IndexError if a mapping filepath lacks mapping_indicator,
    ValueError if a fastq filepath does not contain exactly one occurrence of
    either the read or the barcode indicator, and KeyError if a read file
    that has a barcode file has no matching mapping file.
    """
    read_files = []
    barcode_files = set()
    mapping_files = {}
    matching_files = {}

    # Have to assume trailing text will not match extensions, so have to
    # do some splitting at the extension point to match up.
    for curr_file in all_mapping:
        curr_mapping = curr_file.split(mapping_indicator)
        if len(curr_mapping) < 2:
            raise IndexError(
                "Found file with a mapping file extension that does not "
                "contain the mapping file indicators (see mapping_indicator): "
                "%s" % curr_file)
        # Key: text before the indicator + text after it minus the extension.
        mapping_key = curr_mapping[0] + splitext(curr_mapping[1])[0]
        mapping_files[mapping_key] = curr_file

    for curr_file in all_fastq:
        curr_file_string_read = curr_file.split(read_indicator)
        curr_file_string_bc = curr_file.split(barcode_indicator)

        # Exactly two pieces means the indicator occurs exactly once.
        if len(curr_file_string_read) == 2:
            read_files.append(tuple(curr_file_string_read))
        elif len(curr_file_string_bc) == 2:
            barcode_files.add(tuple(curr_file_string_bc))
        else:
            raise ValueError(
                "Invalid filename found for splitting on input "
                "for file %s, " % curr_file + "check input read indicator "
                "and barcode indicator parameters.")

    for curr_read in read_files:
        # A read matches a barcode file when the text around their
        # indicators is identical; the set lookup replaces a nested scan.
        if curr_read not in barcode_files:
            continue
        read_fp = read_indicator.join(curr_read)
        # Drop everything from the first '.f' (e.g. '.fastq', '.fq') so the
        # key lines up with the extension-less mapping keys built above.
        curr_read_sans_ext = curr_read[0] + curr_read[1].split('.f')[0]
        if curr_read_sans_ext not in mapping_files:
            raise KeyError("Found read file with no matching mapping "
                           "file: %s" % read_fp)
        matching_files[read_fp] = (barcode_indicator.join(curr_read),
                                   mapping_files[curr_read_sans_ext])
    return matching_files
279+
280+
281+
def _clean_leading_text(leading_text):
282+
leading_text = leading_text.strip()
283+
if leading_text:
284+
return leading_text + ' '
285+
else:
286+
return leading_text
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Binary file not shown.
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
2+
TCGTCGATAATCA
3+
+
4+
====ADDDHFHHF
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
6+
ACGTGTACCCAAA
7+
+
8+
@CCFFFDEHHHGF
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
2+
TCGTCGATAATCA
3+
+
4+
====ADDDHFHHF
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
6+
ACGTGTACCCAAA
7+
+
8+
@CCFFFDEHHHGF
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
extract_barcodes:input_type barcode_paired_end
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
2+
TCGTCGATAATCA
3+
+
4+
====ADDDHFHHF
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
6+
ACGTGTACCCAAA
7+
+
8+
@CCFFFDEHHHGF
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
2+
NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
3+
+
4+
#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
6+
NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
7+
+
8+
#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
2+
TCGTCGATAATCA
3+
+
4+
====ADDDHFHHF
5+
@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
6+
ACGTGTACCCAAA
7+
+
8+
@CCFFFDEHHHGF
Binary file not shown.
Binary file not shown.

0 commit comments

Comments
 (0)