biocore
diff --git a/‎ChangeLog.md
Lines changed: 1 addition & 0 deletions b/‎ChangeLog.md
Lines changed: 1 addition & 0 deletions
diff --git a/‎qiime/workflow/preprocess.py
Lines changed: 286 additions & 0 deletions b/‎qiime/workflow/preprocess.py
Lines changed: 286 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_files/unmatched1_R1_zz_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_files/unmatched1_R1_zz_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_files/xxxx_R1_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_files/xxxx_R1_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_files/zzzz_R1_zz_sample.fastq.gz
285 Bytes b/‎qiime_test_data/multiple_extract_barcodes/input_files/zzzz_R1_zz_sample.fastq.gz
285 Bytes
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder1/xxxx_forward_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder1/xxxx_forward_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder1/xxxx_reverse_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder1/xxxx_reverse_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder2/xxxx_forward_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder2/xxxx_forward_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder2/xxxx_reverse_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/input_folders/folder2/xxxx_reverse_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_extract_barcodes/qiime_parameters.txt
Lines changed: 1 addition & 0 deletions b/‎qiime_test_data/multiple_extract_barcodes/qiime_parameters.txt
Lines changed: 1 addition & 0 deletions
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/unmatched1_R1_zz_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_join_paired_ends/input_files/unmatched1_R1_zz_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/unmatched2_R2_zz_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_join_paired_ends/input_files/unmatched2_R2_zz_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/xxxx_R1_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_join_paired_ends/input_files/xxxx_R1_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/xxxx_R2_sample.fastq
Lines changed: 8 additions & 0 deletions b/‎qiime_test_data/multiple_join_paired_ends/input_files/xxxx_R2_sample.fastq
Lines changed: 8 additions & 0 deletions
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/zzzz_R1_zz_sample.fastq.gz
285 Bytes b/‎qiime_test_data/multiple_join_paired_ends/input_files/zzzz_R1_zz_sample.fastq.gz
285 Bytes
diff --git a/‎qiime_test_data/multiple_join_paired_ends/input_files/zzzz_R2_zz_sample.fastq.gz
146 Bytes b/‎qiime_test_data/multiple_join_paired_ends/input_files/zzzz_R2_zz_sample.fastq.gz
146 Bytes
@@ -99,6 +99,7 @@ optionally a mapping file. Check out the new documentation for the naming conven
 * Removed ``make_otu_heatmap_html.py`` in favor of ``make_otu_heatmap.py`` (see discussion on [#1724](https://github.com/biocore/qiime/issues/1724)).
 * Fixed bug that resulted in samples being mislabeled in ``make_otu_heatmap.py`` when one of the following options was passed: ``--category``, ``--map_fname``, ``--sample_tree``, or ``--suppress_column_clustering``. This is discussed in [#1790](https://github.com/biocore/qiime/issues/1790).
 * Added ``--negate_sample_id_fp`` option to ``filter_samples_from_otu_table.py`` (see [#1117](https://github.com/biocore/qiime/issues/1117)).
+* Added three new workflow scripts for facilitating initial QIIME processing of already-demultiplexed fastq files, as these are commonly being provided by sequencing centers. These are: ``multiple_split_libraries_fastq.py``, ``multiple_join_paired_ends.py``, and ``multiple_extract_barcodes.py``.
 
 QIIME 1.8.0 (11 Dec 2013)
 =========================
 
@@ -0,0 +1,286 @@
+from __future__ import division
+
+__author__ = "William Walters"
+__copyright__ = "Copyright 2011, The QIIME Project"
+__credits__ = ["William Walters"]
+__license__ = "GPL"
+__version__ = "1.8.0-dev"
+__maintainer__ = "William Walters"
+__email__ = "William.A.Walters@colorado.edu"
+
+from os.path import join, basename, splitext
+
def create_commands_jpe(pairs, base_output_dir, optional_params = "",
        leading_text = "", trailing_text = "", include_input_dir_path=False,
        remove_filepath_in_name=False, match_barcodes = False,
        bc_pairs = None):
    """ Creates commands for join_paired_ends.py

    pairs: dictionary of forward:reverse read filepaths
    base_output_dir: directory under which one output directory per forward
        read is placed
    optional_params: added parameters to join_paired_ends.py calls
    leading_text: Text to add before join_paired_ends.py call
    trailing_text: Text to add after join_paired_ends.py call
    include_input_dir_path: If True, include input directory in output
        directory names
    remove_filepath_in_name: If True, the base filename will not be used in the
        output directory names.
    match_barcodes: True to match barcodes.
    bc_pairs: dictionary of read1:bc_read filepaths (None or empty if not
        used)

    Returns a list with one entry per forward read; every entry is a
    one-item list holding a (description, command string) tuple.

    Raises KeyError if match_barcodes is True and a forward read has no
    entry in bc_pairs.
    """

    # None instead of a shared mutable default dictionary.
    if bc_pairs is None:
        bc_pairs = {}

    commands = []
    # Most specific first so that '.fastq.gz' is preferred over '.fastq'.
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    for curr_fp in pairs:
        added_output_str = ""
        if include_input_dir_path:
            path_parts = curr_fp.split('/')
            # A bare filename has no parent directory to prepend.
            if len(path_parts) > 1:
                added_output_str = path_parts[-2]
        if not remove_filepath_in_name:
            # Look for the extension in the filename only: a directory such
            # as 'run.fq/' must not decide how the filename is trimmed, and
            # a filename without a known extension is used unchanged.
            curr_name = basename(curr_fp)
            for extension in extensions:
                if extension in curr_name:
                    curr_name = curr_name.split(extension)[0]
                    break
            added_output_str += curr_name

        curr_outputdir = join(base_output_dir, added_output_str)
        if match_barcodes:
            command = "%sjoin_paired_ends.py %s -b %s -f %s -r %s -o %s %s" %\
                (_clean_leading_text(leading_text), optional_params,
                bc_pairs[curr_fp], curr_fp, pairs[curr_fp], curr_outputdir,
                trailing_text)
        else:
            command = "%sjoin_paired_ends.py %s -f %s -r %s -o %s %s" %\
                (_clean_leading_text(leading_text), optional_params, curr_fp,
                pairs[curr_fp], curr_outputdir, trailing_text)

        commands.append([('join_paired_ends.py: %s' % curr_fp, command)])

    return commands
+
def create_commands_eb(all_files, ispaired, base_output_dir,
        optional_params = "", leading_text = "", trailing_text = "",
        include_input_dir_path=False, remove_filepath_in_name=False):
    """ Creates commands for extract_barcodes.py

    all_files: list of input filepaths, or dict of forward:reverse read
        filepaths when ispaired is True
    ispaired: True if paired data
    base_output_dir: directory under which one output directory per input
        file is placed
    optional_params: added parameters to extract_barcodes.py calls
    leading_text: Text to add before extract_barcodes.py call
    trailing_text: Text to add after extract_barcodes.py call
    include_input_dir_path: If True, include input directory in output
        directory names
    remove_filepath_in_name: If True, the base filename will not be used in the
        output directory names.

    Returns a list with one entry per input file; every entry is a one-item
    list holding a (description, command string) tuple.
    """

    commands = []
    # Most specific first so that '.fastq.gz' is preferred over '.fastq'.
    extensions = ['.fastq.gz', '.fastq', '.fq.gz', '.fq']

    for curr_fp in all_files:
        added_output_str = ""
        if include_input_dir_path:
            path_parts = curr_fp.split('/')
            # A bare filename has no parent directory to prepend.
            if len(path_parts) > 1:
                added_output_str = path_parts[-2]
        if not remove_filepath_in_name:
            # Look for the extension in the filename only: a directory such
            # as 'run.fq/' must not decide how the filename is trimmed, and
            # a filename without a known extension is used unchanged.
            curr_name = basename(curr_fp)
            for extension in extensions:
                if extension in curr_name:
                    curr_name = curr_name.split(extension)[0]
                    break
            added_output_str += curr_name

        curr_outputdir = join(base_output_dir, added_output_str)
        if ispaired:
            # Paired input is a mapping, so the value is the reverse read.
            command = "%sextract_barcodes.py %s -f %s -r %s -o %s %s" %\
                (_clean_leading_text(leading_text), optional_params, curr_fp,
                all_files[curr_fp], curr_outputdir, trailing_text)
        else:
            command = "%sextract_barcodes.py %s -f %s -o %s %s" %\
                (_clean_leading_text(leading_text), optional_params, curr_fp,
                curr_outputdir, trailing_text)

        commands.append([('extract_barcodes.py: %s' % curr_fp, command)])

    return commands
+
def create_commands_slf(all_files, demultiplexing_method, output_dir,
        params = "", leading_text = "", trailing_text = "",
        include_input_dir_path=False, remove_filepath_in_name=False,
        sampleid_indicator = "_"):
    """ Creates command for split_libraries_fastq.py

    all_files: list of input filepaths or dict of reads:(barcode,mapping)
    demultiplexing_method: Either 'sampleid_by_file' or 'mapping_barcode_files'
    output_dir: output directory to write split_libraries_fastq output
    params: added parameters to split_libraries_fastq.py calls
    leading_text: Text to add before split_libraries_fastq.py call
    trailing_text: Text to add after split_libraries_fastq.py call
    include_input_dir_path: If True, include input directory in output
        directory names
    remove_filepath_in_name: If True, the base filename will not be used in the
        output directory names.
    sampleid_indicator: Split on this character in input fastq filenames to
        generate output SampleID name.

    Returns a list holding a single one-item list with a
    (description, command string) tuple; all input files are handled by one
    split_libraries_fastq.py call.
    """

    commands = []
    read_files = []
    barcode_files = []
    mapping_files = []
    sample_ids = []

    by_file = demultiplexing_method == 'sampleid_by_file'

    # A set removes duplicates but has no defined iteration order, so the
    # paths are sorted to make the generated command reproducible.
    all_fps = sorted(set(all_files))

    for curr_fp in all_fps:
        read_files.append(curr_fp)
        # Just need to build up a list of SampleID names
        if by_file:
            sample_id = ""
            if include_input_dir_path:
                path_parts = curr_fp.split('/')
                # A bare filename has no parent directory to prepend.
                if len(path_parts) > 1:
                    sample_id = path_parts[-2]
            if not remove_filepath_in_name:
                sample_id += basename(curr_fp).split(sampleid_indicator)[0]
            sample_ids.append(sample_id)
        # Need list of barcode filepaths, mapping filepaths
        else:
            barcode_fp, mapping_fp = all_files[curr_fp][0], all_files[curr_fp][1]
            barcode_files.append(barcode_fp)
            mapping_files.append(mapping_fp)

    if by_file:
        command =\
            "%ssplit_libraries_fastq.py %s -i %s --sample_ids %s -o %s %s --barcode_type 'not-barcoded'" %\
            (_clean_leading_text(leading_text), params, ",".join(read_files),
            ",".join(sample_ids), output_dir, trailing_text)
    else:
        command =\
            "%ssplit_libraries_fastq.py %s -i %s --barcode_read_fps %s --mapping_fps %s -o %s %s" %\
            (_clean_leading_text(leading_text), params, ",".join(read_files),
            ",".join(barcode_files), ",".join(mapping_files),
            output_dir, trailing_text)

    commands.append([('split_libraries_fastq.py', command)])

    return commands
+
def get_pairs(all_files, read1_indicator, read2_indicator, match_barcodes=False,
        barcode_indicator="_I1_"):
    """ Finds pairs of files from a list of files, optionally matches barcodes

    all_files: list of filepaths
    read1_indicator: string indicating read 1 of a pair
    read2_indicator: string indicating read 2 of a pair
    match_barcodes: If True, will attempt to match up barcodes file
    barcode_indicator: string indicating barcode file.

    Returns a (pairs, bc_pairs) tuple: pairs maps each read 1 filepath to its
    read 2 filepath, bc_pairs maps read 1 filepaths to barcode filepaths
    (empty unless match_barcodes is True). Read 1 or read 2 files without a
    partner are left out of pairs.

    Raises ValueError if a filepath does not contain exactly one occurrence
    of an expected indicator, or if match_barcodes is True and a paired
    read 1 file has no barcode file.
    """

    pairs = {}
    bc_pairs = {}

    # A file is identified by the text before and after its indicator; two
    # files belong together when those two pieces are identical.
    read1_files = []
    # Read 2 and barcode files are only looked up, never enumerated, so they
    # are stored as sets of tuples to avoid nested scans.
    read2_files = set()
    bc_files = set()

    for curr_file in all_files:
        curr_file_string_r1 = curr_file.split(read1_indicator)
        curr_file_string_r2 = curr_file.split(read2_indicator)
        if match_barcodes:
            curr_file_string_bc = curr_file.split(barcode_indicator)

        if len(curr_file_string_r1) == 2:
            read1_files.append(tuple(curr_file_string_r1))
        elif len(curr_file_string_r2) == 2:
            read2_files.add(tuple(curr_file_string_r2))
        elif match_barcodes and len(curr_file_string_bc) == 2:
            bc_files.add(tuple(curr_file_string_bc))
        else:
            raise ValueError("Invalid filename found for splitting on input "
                "for file %s, check input read1_indicator "
                "and read2_indicator parameters as well." % curr_file)

    for curr_read1 in read1_files:
        read1_fp = read1_indicator.join(curr_read1)
        if curr_read1 in read2_files:
            pairs[read1_fp] = read2_indicator.join(curr_read1)
        if match_barcodes and curr_read1 in bc_files:
            bc_pairs[read1_fp] = barcode_indicator.join(curr_read1)

    if match_barcodes:
        # Need a specific test if matched barcodes are used-the barcodes should
        # match both the forward and reverse reads.
        non_matching_f_reads = set(pairs) - set(bc_pairs)
        if non_matching_f_reads:
            # Sorted so the message is the same on every run.
            raise ValueError("Found forward reads without matching barcodes "
                "file: %s" % ", ".join(sorted(non_matching_f_reads)))

    return pairs, bc_pairs
+
def get_matching_files(all_fastq, all_mapping,
        read_indicator, barcode_indicator, mapping_indicator):
    """ Matches up read, barcode, and mapping files based on filenames

    all_fastq: list of sequence filepaths
    all_mapping: list of mapping filepaths
    read_indicator: string indicating read file
    barcode_indicator: string indicating barcode file
    mapping_indicator: string indicating mapping file

    Returns a dictionary of read filepath:(barcode filepath, mapping
    filepath). Read files that have no barcode file are left out.

    Raises IndexError if a mapping filepath lacks mapping_indicator,
    ValueError if a fastq filepath does not contain exactly one read or
    barcode indicator, and KeyError if a read/barcode pair has no mapping
    file.
    """

    read_files = []
    # Barcode files are only looked up, so a set of tuples avoids a nested
    # scan for every read file.
    barcode_files = set()
    mapping_files = {}
    matching_files = {}

    # Have to assume trailing text will not match extensions, so have to
    # do some splitting at the extension point to match up.
    for curr_file in all_mapping:
        curr_mapping = curr_file.split(mapping_indicator)
        if len(curr_mapping) < 2:
            raise IndexError(
                "Found file with a mapping file extension that does not "
                "contain the mapping file indicators (see mapping_indicator): "
                "%s" % curr_file)
        mapping_files[curr_mapping[0] +
            splitext(curr_mapping[1])[0]] = curr_file

    for curr_file in all_fastq:
        curr_file_string_read = curr_file.split(read_indicator)
        curr_file_string_bc = curr_file.split(barcode_indicator)

        if len(curr_file_string_read) == 2:
            read_files.append(tuple(curr_file_string_read))
        elif len(curr_file_string_bc) == 2:
            barcode_files.add(tuple(curr_file_string_bc))
        else:
            raise ValueError("Invalid filename found for splitting on input "
                "for file %s, check input read indicator "
                "and barcode indicator parameters." % curr_file)

    for curr_read in read_files:
        if curr_read not in barcode_files:
            continue
        read_fp = read_indicator.join(curr_read)
        curr_read_sans_ext = curr_read[0] +\
            _strip_fastq_extension(curr_read[1])
        if curr_read_sans_ext not in mapping_files:
            raise KeyError("Found read file with no matching mapping "
                "file: %s" % read_fp)
        matching_files[read_fp] = (barcode_indicator.join(curr_read),
            mapping_files[curr_read_sans_ext])
    return matching_files


def _strip_fastq_extension(trailing_text):
    """ Removes the sequence file extension from the end of a filename part

    trailing_text: the part of a read filepath after the read indicator

    Only a recognized extension at the very end is removed, so dots
    elsewhere in the name (e.g. '001.filt.fastq') are preserved and the
    result lines up with the splitext-based mapping file keys.
    """
    for extension in ('.fastq.gz', '.fq.gz', '.fastq', '.fq'):
        if trailing_text.endswith(extension):
            return trailing_text[:-len(extension)]
    # Unrecognized extension: keep the historical behavior of cutting at the
    # first '.f' so that inputs that matched before still match.
    return trailing_text.split('.f')[0]
+
+
+def _clean_leading_text(leading_text):
+    leading_text = leading_text.strip()
+    if leading_text:
+        return leading_text + ' '
+    else:
+        return leading_text
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
+TCGTCGATAATCA
++
+====ADDDHFHHF
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
+ACGTGTACCCAAA
++
+@CCFFFDEHHHGF
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
+TCGTCGATAATCA
++
+====ADDDHFHHF
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
+ACGTGTACCCAAA
++
+@CCFFFDEHHHGF
@@ -0,0 +1 @@
+extract_barcodes:input_type barcode_paired_end
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
+TCGTCGATAATCA
++
+====ADDDHFHHF
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
+ACGTGTACCCAAA
++
+@CCFFFDEHHHGF
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 1:N:0:
+NACGGAGGGTGCGAGCGTTGTCCGGAATCATTGGGCGTAAAGGGCGCGTAGGTGGCCCGGTCAGCGTGCGGTGACAGCTCGGCGCTCAACCACGAGTAGG
++
+#1=DDBDFHCFHHIIJGDHIJJJJIFGGHCGGGGEGIEGCHHHEDB@B8?;@AAB?<?7<75;CCB##################################
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 1:N:0:
+NACGGGGGGGGCAAGCGTTGTTCGGAATTACTGGGCGTAAAGGGTTCGTAGGTGGCCTACTAAGTCAGACGAGAGCTCACTCAGCATATCTGGCGCACTG
++
+#1=DDFFFDDD@6@BBB8?B?BCBDBBDDCDCDD?BB79?ACC?+8?:8??B@@B#############################################
@@ -0,0 +1,8 @@
+@HWI-ST753:99:C038WACXX:4:1101:1252:1987 2:N:0:
+TCGTCGATAATCA
++
+====ADDDHFHHF
+@HWI-ST753:99:C038WACXX:4:1101:1357:1989 2:N:0:
+ACGTGTACCCAAA
++
+@CCFFFDEHHHGF
Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+extract_barcodes:input_type barcode_paired_end`