diff --git a/README.md b/README.md index d493bf7..e53365d 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@ # TEFingerprint A Python library with CLI for fingerprinting transposable elements based on paired-end reads. -This project is currently in a pe-alpha state and rapidly changing. +This project is currently in a alpha state and rapidly changing. ## Development Environment @@ -93,25 +93,41 @@ bwa index -a bwtsw reference.fasta bwa index -a is repeats.fasta ``` -Runing the preprocess pipeline: +Map paired end reads to repeat elements using `BWA MEM` and convert to a sorted bam file with `samtools`: + +``` +bwa mem -t 8 \ + repeats.fasta \ + reads1.fastq \ + reads2.fastq \ + | samtools view -Su - \ + | samtools sort - \ + -o reads_mapped_to_repeats.bam +``` + +Index the resulting bam file with `samtools`: + +``` +samtools index reads_mapped_to_repeats.bam +``` + +Runing the preprocess program on the indexed bam file: ``` tef preprocess \ - reads1.fastq \ - reads2.fastq \ + reads_mapped_to_repeats.bam --reference reference.fasta \ - --repeats repeats.fasta \ --output danglers.bam \ --threads 8 ``` The output file `danglers.bam` contains reads mapped to the reference genome. Each of these reads is tagged with the repeat element that their pair was mapped to. -The `threads` argument specifies how many threads to use for alignment in bwa (python componants of the pipeline are currently single threaded). - -By default, the intermediate files are written to a temporary directory that is automatically removed when the pipeline is completed. These files can be saved by manually specifying a directory with the `--tempdir` option. +Arguments: -By default, the same tage used to store repeat element names associated with each read is `ME` (Mate Element). This can be changed with the `--mate_element_tag` option. +* The `threads` argument specifies how many threads to use for alignment in bwa (python components of the pipeline are currently single threaded). +* By default, the intermediate files are written to a temporary directory that is automatically removed when the pipeline is completed. These files can be saved by manually specifying a directory with the `--tempdir` option. +* By default, the same tag used to store repeat element names associated with each read is `ME` (Mate Element). This can be changed with the `--mate_element_tag` option. ### Fingerprint @@ -122,6 +138,7 @@ tef fingerprint danglers.bam \ -f family1 family2 ... \ -m 20 \ -e 500 \ + -q 30 \ -t 4 \ > fingerprint.gff ``` @@ -134,6 +151,7 @@ Arguments: * `-f/--families` Specifies the (super) families or grouping of repeated elements to fingerprint. These names are matched against the start of the mate element name i.e. the name `Gypsy` would treat reads with tagged with a mate element called `Gypsy3`, `Gypsy27` or `GypsyX` as the same. * `-m/--minreads` Specifies the minimum number of read (tips) required to form a cluster. It is used in combination with `-e/epsilon`. * `-e/epsilon` Specifies the maximum allowable distance among a set of read tips to be considered a (sub) cluster. Sub-clusters are calculated based on `-m/--minreads` and `-e/epsilon` and then overlapping sub-clusters are combined to create cluster. +* `-q/--mapping_quality` Specifies the minimum mapping quality allowed for reads. * `-t/--threads` Specifies the number of CPU threads to use. The maximum number of threads that may be used is the same as the number of references specified. Additional arguments: @@ -141,6 +159,7 @@ Additional arguments: * `--min_eps` The minimum value of epsilon to be used in hierarchical clustering. Defaults to `0`. * `--hierarchical_clustering` Specifies wether or not to use the hierarchical clustering algorithm in order to differentiate between proximate clusters. Defaults to `True`. * `--mate_element_tag` The sam tag used to specify the name of each reads mate element. Defaults to `ME`. + * `--feature_csv` Optionally specify the name of a CSV file to output containing feature data. ### Compare @@ -165,6 +184,7 @@ Arguments: * `-f/--families` Specifies the (super) families or grouping of repeated elements to fingerprint. These names are matched against the start of the mate element name i.e. the name `Gypsy` would treat reads with tagged with a mate element called `Gypsy3`, `Gypsy27` or `GypsyX` as the same. * `-m/--minreads` Specifies the minimum number of read (tips) required to form a cluster. It is used in combination with `-e/epsilon`. * `-e/epsilon` Specifies the maximum allowable distance among a set of read tips to be considered a (sub) cluster. Sub-clusters are calculated based on `-m/--minreads` and `-e/epsilon` and then overlapping sub-clusters are combined to create cluster. +* `-q/--mapping_quality` Specifies the minimum mapping quality allowed for reads. * `-b/--fingerprint_buffer` Specifies a distance (in base pairs) to buffer fingerprints by before combining them into comparative bins. This is used to ensure that small clusters, that are slightly offset in different samples, are treated as a single comparative bin. It also improves the robustness of comparisons by allowing more reads to be included in each bin. Defaults to `0` * `-t/--threads` Specifies the number of CPU threads to use. The maximum number of threads that may be used is the same as the number of references specified. @@ -175,7 +195,8 @@ Additional arguments: * `--hierarchical_clustering` Specifies wether or not to use the hierarchical clustering algorithm in order to differentiate between proximate clusters. Defaults to `True`. * `--bin_buffer` The same as `--fingerprint_buffer` but buffering is performed after fingerprints are combined, therefore less likely to combine slightly offset clusters. Defaults to `0` * `--mate_element_tag` The sam tag used to specify the name of each reads mate element. Defaults to `ME`. - +* `--feature_csv` Optionally specify the name of a CSV file to output containing (long-form) feature data. This produces one row of data per sample per feature. +* `--character_csv` Optionally specify the name of a CSV file to output containing a matrix of read counts. This produces one column per feature by one row per sample. ### Filter GFF @@ -210,4 +231,4 @@ Where `comparison.gff ` is a gff file and `comparison_filtered.gff` is a filtere Arguments: * `-c/--column_filters`: filters to apply to the first 8 standard gff3 columns. These should take the form `''` -* `-a/--attribute_filters`: filters to apply to the attributes column. These should take the form `''` \ No newline at end of file +* `-a/--attribute_filters`: filters to apply to the attributes column. These should take the form `''` diff --git a/README.rst b/README.rst index 6fdc4ed..0df5e0a 100644 --- a/README.rst +++ b/README.rst @@ -2,7 +2,7 @@ TEFingerprint ============= A Python library with CLI for fingerprinting transposable elements based -on paired-end reads. This project is currently in a pe-alpha state and +on paired-end reads. This project is currently in a alpha state and rapidly changing. Development Environment @@ -114,15 +114,32 @@ Index fasta files with bwa: bwa index -a bwtsw reference.fasta bwa index -a is repeats.fasta -Runing the preprocess pipeline: +Map paired end reads to repeat elements using ``BWA MEM`` and convert to +a sorted bam file with ``samtools``: :: - tef preprocess \ + bwa mem -t 8 \ + repeats.fasta \ reads1.fastq \ reads2.fastq \ + | samtools view -Su - \ + | samtools sort - \ + -o reads_mapped_to_repeats.bam + +Index the resulting bam file with ``samtools``: + +:: + + samtools index reads_mapped_to_repeats.bam + +Runing the preprocess program on the indexed bam file: + +:: + + tef preprocess \ + reads_mapped_to_repeats.bam --reference reference.fasta \ - --repeats repeats.fasta \ --output danglers.bam \ --threads 8 @@ -130,18 +147,18 @@ The output file ``danglers.bam`` contains reads mapped to the reference genome. Each of these reads is tagged with the repeat element that their pair was mapped to. -The ``threads`` argument specifies how many threads to use for alignment -in bwa (python componants of the pipeline are currently single -threaded). - -By default, the intermediate files are written to a temporary directory -that is automatically removed when the pipeline is completed. These -files can be saved by manually specifying a directory with the -``--tempdir`` option. +Arguments: -By default, the same tage used to store repeat element names associated -with each read is ``ME`` (Mate Element). This can be changed with the -``--mate_element_tag`` option. +- The ``threads`` argument specifies how many threads to use for + alignment in bwa (python components of the pipeline are currently + single threaded). +- By default, the intermediate files are written to a temporary + directory that is automatically removed when the pipeline is + completed. These files can be saved by manually specifying a + directory with the ``--tempdir`` option. +- By default, the same tag used to store repeat element names + associated with each read is ``ME`` (Mate Element). This can be + changed with the ``--mate_element_tag`` option. Fingerprint ~~~~~~~~~~~ @@ -154,6 +171,7 @@ Example usage: -f family1 family2 ... \ -m 20 \ -e 500 \ + -q 30 \ -t 4 \ > fingerprint.gff @@ -177,6 +195,8 @@ Arguments: of read tips to be considered a (sub) cluster. Sub-clusters are calculated based on ``-m/--minreads`` and ``-e/epsilon`` and then overlapping sub-clusters are combined to create cluster. +- ``-q/--mapping_quality`` Specifies the minimum mapping quality + allowed for reads. - ``-t/--threads`` Specifies the number of CPU threads to use. The maximum number of threads that may be used is the same as the number of references specified. @@ -190,6 +210,8 @@ Additional arguments: proximate clusters. Defaults to ``True``. - ``--mate_element_tag`` The sam tag used to specify the name of each reads mate element. Defaults to ``ME``. +- ``--feature_csv`` Optionally specify the name of a CSV file to output + containing feature data. Compare ~~~~~~~ @@ -226,6 +248,8 @@ Arguments: of read tips to be considered a (sub) cluster. Sub-clusters are calculated based on ``-m/--minreads`` and ``-e/epsilon`` and then overlapping sub-clusters are combined to create cluster. +- ``-q/--mapping_quality`` Specifies the minimum mapping quality + allowed for reads. - ``-b/--fingerprint_buffer`` Specifies a distance (in base pairs) to buffer fingerprints by before combining them into comparative bins. This is used to ensure that small clusters, that are slightly offset @@ -252,6 +276,12 @@ Additional arguments: to combine slightly offset clusters. Defaults to ``0`` - ``--mate_element_tag`` The sam tag used to specify the name of each reads mate element. Defaults to ``ME``. +- ``--feature_csv`` Optionally specify the name of a CSV file to output + containing (long-form) feature data. This produces one row of data + per sample per feature. +- ``--character_csv`` Optionally specify the name of a CSV file to + output containing a matrix of read counts. This produces one column + per feature by one row per sample. Filter GFF ~~~~~~~~~~ diff --git a/setup.py b/setup.py index 9b909ae..56d0df3 100644 --- a/setup.py +++ b/setup.py @@ -9,7 +9,7 @@ def read_file(file_name): setup(name='tefingerprint', - version='0.0.3', + version='0.1.0', author='Tim Millar', author_email='tim.millar@plantandfood.co.nz', url='https://github.com/PlantandFoodResearch/TEFingerprint', @@ -17,5 +17,5 @@ def read_file(file_name): long_description=read_file('README.MD'), scripts=['applications/tef'], packages=['tefingerprint'], - classifiers=['Development Status :: 2 - Pre-Alpha'] + classifiers=['Development Status :: 3 - Alpha'] ) diff --git a/tefingerprint/bamio.py b/tefingerprint/bamio.py index 5bb296d..175cc25 100644 --- a/tefingerprint/bamio.py +++ b/tefingerprint/bamio.py @@ -7,7 +7,7 @@ from itertools import product -def _read_bam_strings(bam, reference, strand): +def _read_bam_strings(bam, reference, strand, quality): """ Read strings from an indexed Bam file. @@ -17,14 +17,19 @@ def _read_bam_strings(bam, reference, strand): :type reference: str :param strand: Select reads that are found on a specific strand, '+' or '-' :type strand: str + :param quality: minimum mapping quality + :type quality: int :return: A list of Sam formatted strings :rtype: list[str] """ - SAM_FLAGS = {'+': ('-F', '20'), - '-': ('-f', '16', '-F', '4')} - flag = SAM_FLAGS[strand] - return np.array(pysam.view(*flag, bam, reference).splitlines()) + if strand == '+': + args = ('-q', str(quality), '-F', '20', bam, reference) + elif strand == '-': + args = ('-q', str(quality), '-f', '16', '-F', '4', bam, reference) + else: + raise ValueError + return np.array(pysam.view(*args).splitlines()) def _read_bam_reference_lengths(bam): @@ -95,7 +100,7 @@ def _parse_read_loci(strings): yield start, stop, name -def _read_bam_ref_strand_loci(bam, reference, strand, categories, tag='ME'): +def _read_bam_ref_strand_loci(bam, reference, strand, categories, quality, tag='ME'): """ Read a single strand of a single reference from a bam file and categories by their associated transposon. If a reference is specified by name only, its range will be extracted from the bam and appended. @@ -108,6 +113,8 @@ def _read_bam_ref_strand_loci(bam, reference, strand, categories, tag='ME'): :type strand: str :param categories: a list of transposon categories :type categories: list[str] + :param quality: minimum mapping quality + :type quality: int :param tag: the two letter sam tag containing the transposon associated with each read :type tag: str @@ -115,7 +122,7 @@ def _read_bam_ref_strand_loci(bam, reference, strand, categories, tag='ME'): :rtype: generator[((str, str, str, str), generator[((int, int, str))])] """ source = os.path.basename(bam) - strings = _read_bam_strings(bam, reference, strand) + strings = _read_bam_strings(bam, reference, strand, quality) if ':' in reference: pass else: @@ -128,7 +135,7 @@ def _read_bam_ref_strand_loci(bam, reference, strand, categories, tag='ME'): yield (reference, strand, category, source), loci -def _read_bam_ref_loci(bam, reference, categories, tag='ME'): +def _read_bam_ref_loci(bam, reference, categories, quality, tag='ME'): """ Read both strands of a single reference from a bam file and categories by their associated transposon. If a reference is specified by name only, its range will be extracted from the bam and appended. @@ -139,19 +146,21 @@ def _read_bam_ref_loci(bam, reference, categories, tag='ME'): :type reference: str :param categories: a list of transposon categories :type categories: list[str] + :param quality: minimum mapping quality + :type quality: int :param tag: the two letter sam tag containing the transposon associated with each read :type tag: str :return: a generator of category tuples and loci tuple generators :rtype: generator[((str, str, str, str), generator[((int, int, str))])] """ - for block in _read_bam_ref_strand_loci(bam, reference, '+', categories, tag=tag): + for block in _read_bam_ref_strand_loci(bam, reference, '+', categories, quality, tag=tag): yield block - for block in _read_bam_ref_strand_loci(bam, reference, '-', categories, tag=tag): + for block in _read_bam_ref_strand_loci(bam, reference, '-', categories, quality, tag=tag): yield block -def extract_bam_reads(bams, categories, references=None, tag='ME'): +def extract_bam_reads(bams, categories, references=None, quality=30, tag='ME'): """ Extract reads from one or more bam file(s) and categories reads by their reference, strand, associated transposon, and source bam file. @@ -163,6 +172,8 @@ def extract_bam_reads(bams, categories, references=None, tag='ME'): :type references: str | list[str] :param categories: target transposon categories :type categories: str | list[str] + :param quality: minimum mapping quality + :type quality: int :param tag: the two letter sam tag containing the transposon associated with each read :type tag: str @@ -183,7 +194,7 @@ def extract_bam_reads(bams, categories, references=None, tag='ME'): # run jobs jobs = product(bams, references) for bam, reference in jobs: - for block in _read_bam_ref_loci(bam, reference, categories, tag=tag): + for block in _read_bam_ref_loci(bam, reference, categories, quality, tag=tag): yield block diff --git a/tefingerprint/batch.py b/tefingerprint/batch.py index 3f008fa..d5d0dc2 100644 --- a/tefingerprint/batch.py +++ b/tefingerprint/batch.py @@ -5,16 +5,34 @@ from tefingerprint import loci -def _fingerprint_worker(bams, categories, reference, transposon_tag, min_reads, eps, min_eps, hierarchical): - """Worker function for batch fingerprinting""" - reads = loci.append(*[loci.ReadLoci.from_bam(bam, categories, references=reference, tag=transposon_tag) - for bam in bams]) - return reads.fingerprint(min_reads, eps, min_eps=min_eps, hierarchical=hierarchical) +def _fingerprint_worker(bams, + categories, + reference, + quality, + transposon_tag, + min_reads, + eps, + min_eps, + hierarchical): + """ + Worker function for batch fingerprinting. + Runs a single job dispatched from fingerprint. + """ + reads = loci.append(*[loci.ReadLoci.from_bam(bam, + categories, + references=reference, + quality=quality, + tag=transposon_tag) for bam in bams]) + return reads.fingerprint(min_reads, + eps, + min_eps=min_eps, + hierarchical=hierarchical) def fingerprint(bams=None, categories=None, references=None, + quality=30, transposon_tag='ME', min_reads=None, eps=None, @@ -30,6 +48,8 @@ def fingerprint(bams=None, :type categories: list[str] :param references: targeted (slices of) references with format 'name' or 'name:minimum-maximum' :type references: list[str] + :param quality: minimum mapping quality + :type quality: int :param transposon_tag: the two letter sam tag containing the transposon associated with each read :type transposon_tag: str :param min_reads: minimum number of read tips required to form a fingerprint locus @@ -46,7 +66,15 @@ def fingerprint(bams=None, :return: Fingerprints of bam files :rtype: :class:`loci.FingerPrint` """ - jobs = product([bams], [categories], references, [transposon_tag], [min_reads], [eps], [min_eps], [hierarchical]) + jobs = product([bams], + [categories], + references, # job per reference + [quality], + [transposon_tag], + [min_reads], + [eps], + [min_eps], + [hierarchical]) if cores == 1: return loci.append(*[_fingerprint_worker(*job) for job in jobs]) @@ -55,9 +83,26 @@ def fingerprint(bams=None, return loci.append(*pool.starmap(_fingerprint_worker, jobs)) -def _comparison_worker(bams, categories, reference, transposon_tag, min_reads, eps, min_eps, hierarchical, fingerprint_buffer, bin_buffer): - """Worker function for batch fingerprint comparisons""" - reads = loci.append(*[loci.ReadLoci.from_bam(bam, categories, references=reference, tag=transposon_tag) +def _comparison_worker(bams, + categories, + reference, + quality, + transposon_tag, + min_reads, + eps, + min_eps, + hierarchical, + fingerprint_buffer, + bin_buffer): + """ + Worker function for batch fingerprint comparisons. + Runs a single job dispatched from comparison. + """ + reads = loci.append(*[loci.ReadLoci.from_bam(bam, + categories, + references=reference, + quality=quality, + tag=transposon_tag) for bam in bams]) fprint = reads.fingerprint(min_reads, eps, min_eps=min_eps, hierarchical=hierarchical) fprint.buffer(fingerprint_buffer) @@ -69,6 +114,7 @@ def _comparison_worker(bams, categories, reference, transposon_tag, min_reads, e def comparison(bams=None, categories=None, references=None, + quality=30, transposon_tag='ME', min_reads=None, eps=None, @@ -86,6 +132,8 @@ def comparison(bams=None, :type categories: list[str] :param references: targeted (slices of) references with format 'name' or 'name:minimum-maximum' :type references: list[str] + :param quality: minimum mapping quality + :type quality: int :param transposon_tag: the two letter sam tag containing the transposon associated with each read :type transposon_tag: str :param min_reads: minimum number of read tips required to form a fingerprint locus @@ -106,7 +154,17 @@ def comparison(bams=None, :return: Fingerprint comparisons of bam files :rtype: :class:`loci.Comparison` """ - jobs = product([bams], [categories], references, [transposon_tag], [min_reads], [eps], [min_eps], [hierarchical], [fingerprint_buffer], [bin_buffer]) + jobs = product([bams], + [categories], + references, # job per reference + [quality], + [transposon_tag], + [min_reads], + [eps], + [min_eps], + [hierarchical], + [fingerprint_buffer], + [bin_buffer]) if cores == 1: return loci.append(*[_comparison_worker(*job) for job in jobs]) diff --git a/tefingerprint/loci.py b/tefingerprint/loci.py index 45ac977..8f4a6b1 100644 --- a/tefingerprint/loci.py +++ b/tefingerprint/loci.py @@ -1,5 +1,6 @@ #! /usr/bin/env python +import re import numpy as np from tefingerprint import bamio from tefingerprint import cluster @@ -274,8 +275,8 @@ def as_array(self): array = np.append(array, sub_array) # this method is faster and avoids sorting on objects in the array (possible source of errors) # but does not sort on other values so sub-orders may vary - index = np.argsort(array[['reference', 'start', 'stop']], - order=('reference', 'start', 'stop')) + index = np.argsort(array[['reference', 'start', 'stop', 'category']], + order=('reference', 'start', 'stop', 'category')) array = array[index] return array @@ -397,7 +398,7 @@ class ReadLoci(GenomeLoci): ('name', np.str_, 254)]) @classmethod - def from_bam(cls, bams, categories, references=None, tag='ME'): + def from_bam(cls, bams, categories, references=None, quality=30, tag='ME'): """ Create an object of :class:`ReadLoci` from a bam file. The bam file should contain aligned reads without pairs where each read is tagged with the category of @@ -410,6 +411,8 @@ def from_bam(cls, bams, categories, references=None, tag='ME'): :type references: str :param categories: A list of transposon categories to group reads by :type categories: list[str] + :param quality: minimum mapping quality + :type quality: int :param tag: The sam tag containing the transposon each read corresponds to :type tag: str @@ -421,6 +424,7 @@ def from_bam(cls, bams, categories, references=None, tag='ME'): for group, loci in bamio.extract_bam_reads(bams, categories, references=references, + quality=quality, tag=tag)} return reads @@ -538,6 +542,19 @@ class FingerPrint(GenomeLoci): ('start', np.int64), ('stop', np.int64)]) + def as_csv(self): + """ + Convert all loci to a a CSV formatted string sorted by location. + + :return: a CSV formatted string + :rtype: str + """ + array = self.as_array() + header = ','.join(array.dtype.names) + data = str(array).strip('[]').replace("(", "").replace(")", "").replace("'", '"').replace(" ", "") + data = re.sub(r':\S*?"', '"', data) + return header + '\n' + data + @staticmethod def _format_gff_feature(record): """ @@ -776,8 +793,8 @@ def as_flat_array(self): array = np.append(array, sub_array) # this method is faster and avoids sorting on objects in the array (possible source of errors) # but does not sort on other values so sub-orders may vary - index = np.argsort(array[['reference', 'start', 'stop']], - order=('reference', 'start', 'stop')) + index = np.argsort(array[['reference', 'start', 'stop', 'category']], + order=('reference', 'start', 'stop', 'category')) array = array[index] return array @@ -826,6 +843,50 @@ def as_flat_gff(self): sample_numbers = {k: v for v, k in enumerate(np.sort(np.unique(array['source'])))} return '\n'.join((self._format_flat_gff_feature(record, sample_numbers) for record in array)) + def as_flat_csv(self): + """ + Convert all loci to a a CSV formatted string sorted by location. + This method creates one entry per sample per locus to avoid nested structures. + + :return: a CSV formatted string + :rtype: str + """ + array = self.as_flat_array() + header = ','.join(array.dtype.names) + data = str(array).strip('[]').replace("(", "").replace(")", "").replace("'", '"').replace(" ", "") + data = re.sub(r':\S*?"', '"', data) + return header + '\n' + data + + def as_character_array(self): + """ + Formats comparision results as an array of character scores. + + This can easily be converted to a pandas dataframe using `pandas.DataFrame(*self.as_character_array())`. + + :return: a tuple containing an array of character states and an array of sample names + """ + array = self.as_array() + dtype = [("{0}:{1}-{2}_{3}_{4}".format(item['reference'].split(':')[0], + item['start'], + item['stop'], + item['strand'], + item['category']), np.int64) for item in array] + dtype = np.dtype(dtype) + samples = array[0]['sources'] + characters = np.array([*array['counts']]).transpose().ravel().view(dtype=dtype) + return characters, samples + + def as_character_csv(self): + """ + Formats comparision results as an csv of character scores. + + :return: a csv formatted string of character scores + :rtype: str + """ + array, names = self.as_character_array() + columns = '"sample",' + str(array.dtype.names).strip('()').replace("'", '"').replace(" ", "") + characters = '\n'.join([str(row[1]) + ', ' + str(row[0]).strip('()').replace(" ", "") for row in zip(array, names)]) + return columns + '\n' + characters if __name__ == '__main__': pass diff --git a/tefingerprint/preprocess.py b/tefingerprint/preprocess.py index bea5eed..ab05352 100644 --- a/tefingerprint/preprocess.py +++ b/tefingerprint/preprocess.py @@ -6,22 +6,25 @@ import os import shutil import sys +import re from tempfile import mkdtemp class PreProcessProgram(object): """""" - def __init__(self, fastq_1, fastq_2, + def __init__(self, dangler_bam, reference_fasta, - repeats_fasta, output_bam, + include_tails=True, + tail_minimum_length=38, mate_element_tag='ME', - temp_dir=False, threads=1): - self.fastq_1 = fastq_1 - self.fastq_2 = fastq_2 + temp_dir=False, + threads=1): + self.dangler_bam = dangler_bam self.reference_fasta = reference_fasta - self.repeats_fasta = repeats_fasta self.output_bam = output_bam + self.include_tails = include_tails + self.tail_minimum_length = tail_minimum_length self.mate_element_tag = mate_element_tag self.temp_dir = temp_dir self.threads = str(threads) @@ -37,22 +40,28 @@ def _cli(args): :return: Dictionary like object of arguments and values """ parser = argparse.ArgumentParser('Identify potential TE flanking regions') - parser.add_argument('reads', + parser.add_argument('dangler_bam', type=str, - nargs=2, - help='A pair of fastq files containing paired end reads') + nargs=1, + help='A bam file containing dangler reads') parser.add_argument('--reference', type=str, nargs=1, help="Reference genome in fasta format with bwa index") - parser.add_argument('--repeats', - type=str, - nargs=1, - help="Library of repeat elements in fasta format with bwa index") parser.add_argument('-o', '--output', type=str, nargs=1, help="Name of output bam file") + parser.add_argument('--include_tails', + type=bool, + nargs=1, + default=[True], + help="Include soft-clipped tails of pairs properly mapping to a repeat element") + parser.add_argument('--tail_minimum_length', + nargs=1, + type=int, + default=[38], + help="Minimum required length for inclusion of soft-clipped tails") parser.add_argument('--mate_element_tag', type=str, nargs=1, @@ -73,10 +82,8 @@ def _cli(args): @staticmethod def from_cli(args): arguments = PreProcessProgram._cli(args) - return PreProcessProgram(arguments.reads[0], - arguments.reads[1], + return PreProcessProgram(arguments.dangler_bam[0], arguments.reference[0], - arguments.repeats[0], arguments.output[0], mate_element_tag=arguments.mate_element_tag[0], temp_dir=arguments.tempdir[0], @@ -84,44 +91,53 @@ def from_cli(args): def _run_pipeline(self, scratch_dir): """""" - # map reads to repeats and store as temp file - temp_bam_1 = os.path.join(scratch_dir, '1_pairedReadsMappedToRepeats.bam') - print('>>> Mapping paired reads to repeat library at: {0}'.format(temp_bam_1)) - map_pairs_to_repeat_elements(self.fastq_1, - self.fastq_2, - self.repeats_fasta, - temp_bam_1, - self.threads) - - # index bam - print('>>> Indexing: {0}'.format(temp_bam_1)) - index_bam(temp_bam_1) - # extract danglers from bam - print('>>> Extracting dangler reads from: {0}'.format(temp_bam_1)) - dangler_strings = extract_danglers(temp_bam_1) - - # convert dangler strings to temp fastq - temp_fastq_danglers = os.path.join(scratch_dir, '2_danglerReads.fastq') - print('>>> Writing dangler reads to: {0}'.format(temp_fastq_danglers)) - sam_strings_to_fastq(dangler_strings, - temp_fastq_danglers) + print('>>> Extracting dangler reads from: {0}'.format(self.dangler_bam)) + forward_dangler_strings = extract_forward_danglers(self.dangler_bam) + reverse_dangler_strings = extract_reverse_danglers(self.dangler_bam) + # store sam strings for latter tagging + sam_strings = forward_dangler_strings + reverse_dangler_strings + + # convert dangler strings to fastq + fastq_lines = '' + fastq_lines += '\n'.join((forward_danglers_as_fastq(forward_dangler_strings))) + fastq_lines += '\n' + '\n'.join((reverse_danglers_as_fastq(reverse_dangler_strings))) + + if self.include_tails: + # extract soft clipped tails from bam + print('>>> Extracting soft clipped tails from: {0}'.format(self.dangler_bam)) + foward_reads = extract_forward_read_of_mapped_pair(self.dangler_bam) + reverse_reads = extract_reverse_read_of_mapped_pair(self.dangler_bam) + sam_strings += foward_reads + reverse_reads + + # add soft clipped tail strings to fastq + fastq_lines += '\n' + '\n'.join(forward_soft_clipped_tails_as_fastq(foward_reads, + min_length=self.tail_minimum_length)) + fastq_lines += '\n' + '\n'.join(reverse_soft_clipped_tails_as_fastq(reverse_reads, + min_length=self.tail_minimum_length)) + + # write temp fastq + temp_fastq = os.path.join(scratch_dir, '1_danglerReads.fastq') + print('>>> Writing dangler reads to: {0}'.format(temp_fastq)) + with open(temp_fastq, 'w') as f: + for line in fastq_lines: + f.write(line) # map danglers to reference - temp_bam_2 = os.path.join(scratch_dir, '3_danglerReadsMappedToReference.bam') - print('>>> Mapping dangler reads to reference at: {0}'.format(temp_bam_2)) - map_danglers_to_reference(temp_fastq_danglers, + temp_bam = os.path.join(scratch_dir, '2_danglerReadsMappedToReference.bam') + print('>>> Mapping dangler reads to reference at: {0}'.format(temp_bam)) + map_danglers_to_reference(temp_fastq, self.reference_fasta, - temp_bam_2, + temp_bam, self.threads) # index bam - print('>>> Indexing: {0}'.format(temp_bam_2)) - index_bam(temp_bam_2) + print('>>> Indexing: {0}'.format(temp_bam)) + index_bam(temp_bam) # tag danglers and write output file print('>>> Adding tags to mapped danglers at: {0}'.format(self.output_bam)) - tag_danglers(temp_bam_2, dangler_strings, self.output_bam, self.mate_element_tag) + tag_danglers(temp_bam, sam_strings, self.output_bam, self.mate_element_tag) # index bam print('>>> Indexing: {0}'.format(self.output_bam)) @@ -176,20 +192,6 @@ def _check_programs_installed(*args): return True -def index_fasta(fasta): - """ - Create index files for bwa mem - - :param fasta: fasta file to index - :type fasta: str - - :return: subprocess code - :rtype: int - """ - _check_programs_installed('bwa') - return subprocess.call(['bwa index -a is ' + fasta], shell=True) - - def index_bam(bam): """ Create index files for bwa mem @@ -228,33 +230,202 @@ def map_pairs_to_repeat_elements(fastq_1, fastq_2, repeats_fasta, output_bam, th return subprocess.call([command], shell=True) -def extract_danglers(bam): +def reverse_complement(sequence): + """ + Computes the reverse compliment of a nucleotide string. + + :param sequence: capitalised string of nucleotides {'A', 'T', 'G', 'C', 'N'} + :type sequence: str + + :return: capitalised reverse compliment string of nucleotides + :rtype: str + """ + complement = {'A': 'T', 'C': 'G', 'G': 'C', 'T': 'A', 'N': 'N'} + return "".join(complement.get(base, base) for base in reversed(sequence)) + + +def parse_sam_string(string): """ - Extracts unmapped reads with a mapped pair. + Parses the first 11 columns of a sam formatted string into a dictionary. + + :param string: sam formatted read + :type string: str + + :return: dictionary of named read attributes + :type: dict[str, str] + """ + attributes = string.split("\t") + return {'QNAME': attributes[0], + 'FLAG': attributes[1], + 'RNAME': attributes[2], + 'POS': attributes[3], + 'MAPQ': attributes[4], + 'CIGAR': attributes[5], + 'RNEXT': attributes[6], + 'PNEXT': attributes[7], + 'TLEN': attributes[8], + 'SEQ': attributes[9], + 'QUAL': attributes[10]} + + +def extract_forward_danglers(bam): + """ + Extracts unmapped reads with a mapped pair where the unmapped read does not have the reverse bit flag. :param bam: bam file of paired end reads aligned to repeat-element reference :type bam: str :return: sam formatted strings - :rtype: str + :rtype: list[str] + """ + return pysam.view('-F', '2304', '-F', '16', '-F', '8', '-f', '4', bam).splitlines() + + +def forward_danglers_as_fastq(sam_strings): + """ + Convert unmapped reads with a mapped pair to fastq format. + This function assumes the unmapped reads are not reverse complimented. + + :param sam_strings: sam formatted strings + :type sam_strings: list[str] + + :return: a list of fastq formatted reads + :rtype: list[str] + """ + reads = iter(parse_sam_string(string) for string in sam_strings) + # convert to fastq strings + fastqs = ["@{0}\n{1}\n+\n{2}".format(read['QNAME'], + read['SEQ'], + read['QUAL']) for read in reads] + return fastqs + + +def extract_reverse_danglers(bam): + """ + Extracts unmapped reads with a mapped pair where the unmapped read does have the reverse bit flag. + + :param bam: bam file of paired end reads aligned to repeat-element reference + :type bam: str + + :return: sam formatted strings + :rtype: list[str] + """ + return pysam.view('-F', '2304', '-f', '16', '-F', '8', '-f', '4', bam).splitlines() + + +def reverse_danglers_as_fastq(sam_strings): + """ + Convert unmapped reads with a mapped pair to fastq format. + This function assumes the unmapped reads have been reverse complimented and will revert them. + + :param sam_strings: sam formatted strings + :type sam_strings: list[str] + + :return: a list of fastq formatted reads + :rtype: list[str] + """ + reads = iter(parse_sam_string(string) for string in sam_strings) + # convert to fastq strings + fastqs = ["@{0}\n{1}\n+\n{2}".format(read['QNAME'], + reverse_complement(read['SEQ']), + read['QUAL'][::-1]) for read in reads] + return fastqs + + +def extract_forward_read_of_mapped_pair(bam): + """ + Extracts forward reads of proper mapped pairs. + + When mapping reads to repeat elements, sometimes both reads will map as a proper pair. + If one of these reads has a soft-clipped tail, it can be used as an additional 'dangler' read. + + :param bam: bam file of paired end reads aligned to repeat-element reference + :type bam: str + + :return: sam formatted strings + :rtype: list[str] """ - return pysam.samtools.view('-F', '0x800', '-F', '0x100', '-F', '8', '-f', '4', bam) + return pysam.view('-F', '2304', '-f', '2', '-F', '16', '-F', '4', '-F', '8', bam).splitlines() -def sam_strings_to_fastq(sam_strings, output_fastq): +def forward_soft_clipped_tails_as_fastq(sam_strings, min_length=38): """ - Extracts unmapped reads with a mapped pair and writes them to a fastq file. + Identify soft-clipped tails from forward reads to be used as additional danglers. + + The soft-clipped section is converted to fastq format for re-alignment to the reference genome + if the clipped section is longer than the specified 'min_length'. :param sam_strings: sam formatted strings - :type sam_strings: str - :param output_fastq: fastq file of non-mapped reads with pair mapping to repeat-element - :type output_fastq: str + :type sam_strings: list[str] + + :param min_length: minimum length of soft clipped section required (defaults to 20 base pairs) + :type min_length: int + + :return: a list of fastq formatted reads + :rtype: list[str] + """ + reads = [parse_sam_string(read) for read in sam_strings] + # extract soft clipped tails + clips = [re.findall(r"^[0-9]+[S]", read['CIGAR']) for read in reads] + # subset and pair reads with soft clipped tails + reads = [case for case in zip(reads, clips) if case[1]] + # clean up clip lengths + reads = [(read, int(clip[0].strip('S'))) for read, clip in reads] + # filter out short clips + reads = [(read, clip) for read, clip in reads if clip >= min_length] + # convert to fastq strings + fastqs = ['@{0}\n{1}\n+\n{2}'.format(read['QNAME'], + read['SEQ'][0: clip + 1], + read['QUAL'][0: clip + 1]) for read, clip in reads] + return fastqs + + +def extract_reverse_read_of_mapped_pair(bam): + """ + Extracts reverse reads of proper mapped pairs. + + When mapping reads to repeat elements, sometimes both reads will map as a proper pair. + If one of these reads has a soft-clipped tail, it can be used as an additional 'dangler' read. + + :param bam: bam file of paired end reads aligned to repeat-element reference + :type bam: str + + :return: sam formatted strings + :rtype: list[str] + """ + return pysam.view('-F', '2304', '-f', '2', '-f', '16', '-F', '4', '-F', '8', bam).splitlines() + + +def reverse_soft_clipped_tails_as_fastq(sam_strings, min_length=38): + """ + Identify soft-clipped tails from reverse reads to be used as additional danglers. + + The reverse compliment soft-clipped section is converted to fastq format for re-alignment to the reference genome + if the clipped section is longer than the specified 'min_length'. + + :param sam_strings: sam formatted strings + :type sam_strings: list[str] + + :param min_length: minimum length of soft clipped section required (defaults to 20 base pairs) + :type min_length: int + + :return: a list of fastq formatted reads + :rtype: list[str] """ - sam_attributes = iter(line.split('\t') for line in sam_strings.splitlines()) - fastq_lines = ("@{0}\n{1}\n+\n{2}\n".format(items[0], items[9], items[10]) for items in sam_attributes) - with open(output_fastq, 'w') as f: - for line in fastq_lines: - f.write(line) + reads = [parse_sam_string(read) for read in sam_strings] + # extract soft clipped tails + clips = [re.findall(r"[0-9]+[S]$", read['CIGAR']) for read in reads] + # subset and pair reads with soft clipped tails + reads = [case for case in zip(reads, clips) if case[1]] + # clean up clip lengths + reads = [(read, int(clip[0].strip('S'))) for read, clip in reads] + # filter out short clips + reads = [(read, clip) for read, clip in reads if clip >= min_length] + # convert to fastq strings + fastqs = ['@{0}\n{1}\n+\n{2}'.format(read['QNAME'], + reverse_complement(read['SEQ'][-clip:]), + read['QUAL'][-clip:][::-1]) for read, clip in reads] + return fastqs def map_danglers_to_reference(fastq, reference_fasta, output_bam, threads): @@ -279,18 +450,18 @@ def map_danglers_to_reference(fastq, reference_fasta, output_bam, threads): return subprocess.call([command], shell=True) -def tag_danglers(dangler_bam, dangler_strings, output_bam, tag): +def tag_danglers(dangler_bam, sam_strings, output_bam, tag): """ Tags mapped danglers with the id of the element which their mate mapped to and writes to new bam. :param dangler_bam: bam file of dangler reads aligned to reference genome :type dangler_bam: str - :param dangler_strings: sam formatted strings of non-mapped reads with pair mapping to repeat-element - :type dangler_strings: str + :param sam_strings: sam formatted strings of non-mapped reads with pair mapping to repeat-element + :type sam_strings: list[str] :param output_bam: bam file of dangler reads aligned to reference genome and tagged with mate element :type output_bam: str """ - sam_attributes = iter(line.split('\t') for line in dangler_strings.splitlines()) + sam_attributes = iter(line.split('\t') for line in sam_strings) read_mate_elements = {attrs[0]: attrs[2] for attrs in sam_attributes} danglers_untagged = pysam.Samfile(dangler_bam, "rb") danglers_tagged = pysam.Samfile(output_bam, "wb", template=danglers_untagged) diff --git a/tefingerprint/programs.py b/tefingerprint/programs.py index b4ff907..d4d07ee 100644 --- a/tefingerprint/programs.py +++ b/tefingerprint/programs.py @@ -12,6 +12,7 @@ def __init__(self, arguments): result = batch.fingerprint(bams=self.args.input_bams, references=self.args.references, categories=self.args.families, + quality=self.args.mapping_quality[0], transposon_tag=self.args.mate_element_tag[0], min_reads=self.args.min_reads[0], eps=self.args.epsilon[0], @@ -19,6 +20,9 @@ def __init__(self, arguments): hierarchical=self.args.hierarchical_clustering[0], cores=self.args.threads[0]) print(result.as_gff()) + if self.args.feature_csv[0]: + with open(self.args.feature_csv[0], 'w') as csv: + csv.write(result.as_csv()) @staticmethod def parse_args(args): @@ -46,6 +50,11 @@ def parse_args(args): help='TE categories to be used. ' 'These must be exact string match\'s to start of read name and are used to split ' 'reads into categories for analysis') + parser.add_argument('-q', '--mapping_quality', + type=int, + nargs=1, + default=[30], + help='Minimum mapping quality') parser.add_argument('--mate_element_tag', type=str, default=['ME'], @@ -87,6 +96,11 @@ def parse_args(args): nargs=1, help='By default hierarchical HUDC algorithm is used. If this is set to False, ' 'the non-hierarchical UDC algorithm is used and min_eps is inored.') + parser.add_argument('--feature_csv', + type=str, + default=[False], + nargs=1, + help='Optionally write a csv file of features.') parser.add_argument('-t', '--threads', type=int, default=[1], @@ -105,6 +119,7 @@ def __init__(self, arguments): result = batch.comparison(bams=self.args.input_bams, references=self.args.references, categories=self.args.families, + quality=self.args.mapping_quality[0], transposon_tag=self.args.mate_element_tag[0], min_reads=self.args.min_reads[0], eps=self.args.epsilon[0], @@ -117,6 +132,12 @@ def __init__(self, arguments): print(result.as_flat_gff()) else: print(result.as_gff()) + if self.args.character_csv[0]: + with open(self.args.character_csv[0], 'w') as csv: + csv.write(result.as_character_csv()) + if self.args.feature_csv[0]: + with open(self.args.feature_csv[0], 'w') as csv: + csv.write(result.as_flat_csv()) @staticmethod def parse_args(args): @@ -144,6 +165,11 @@ def parse_args(args): help='TE categories to be used. ' 'These must be exact string match\'s to start of read name and are used to split ' 'reads into categories for analysis') + parser.add_argument('-q', '--mapping_quality', + type=int, + nargs=1, + default=[30], + help='Minimum mapping quality') parser.add_argument('--mate_element_tag', type=str, default=['ME'], @@ -203,6 +229,18 @@ def parse_args(args): nargs=1, help='If True, the resulting gff file will contain one feature per sample per bin. ' 'This avoids nested lists in the feature attributes but results in many more features') + parser.add_argument('--feature_csv', + type=str, + default=[False], + nargs=1, + help='Optionally write a csv file of features ' + '(one row per sample per comparative bin)') + parser.add_argument('--character_csv', + type=str, + default=[False], + nargs=1, + help='Optionally write a csv file of data as character states ' + '(rows of samples * columns of comparative bins)') parser.add_argument('-t', '--threads', type=int, default=[1], diff --git a/tests/data/testA-2017-06-08.bam b/tests/data/testA-2017-06-08.bam new file mode 100644 index 0000000..e441a0b Binary files /dev/null and b/tests/data/testA-2017-06-08.bam differ diff --git a/tests/data/testA-2017-06-08.bam.bai b/tests/data/testA-2017-06-08.bam.bai new file mode 100644 index 0000000..9ab9ae6 Binary files /dev/null and b/tests/data/testA-2017-06-08.bam.bai differ diff --git a/tests/data/testA-2017-06-08.sam b/tests/data/testA-2017-06-08.sam new file mode 100644 index 0000000..cbfefdd --- /dev/null +++ b/tests/data/testA-2017-06-08.sam @@ -0,0 +1,53 @@ +@HD VN:1.3 SO:coordinate +@SQ SN:chr1 LN:25000 +@PG ID:bwa PN:bwa VN:0.7.12-r1044 CL:bwa mem -t 1 +read001 0 chr1 2353 60 100M * 0 0 CACGCAAGCCTCATTCCCTTCTCTCTATGCTTCCTGGCTTACCGTTAATTAAGATTCATGAAAGTGTTAGGGACAATGCCTCTTCTTTAGAACTGGCAAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:5 MD:Z:8A19C26G27G0A15 AS:i:75 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read002 0 chr1 2407 60 100M * 0 0 TGCATGAAAGTGTTAGGGACAATGCGTCTGATTTAGAACTGGCAACGACTTTGCTGGAAAATATCCCATCTGCTTGAGAGGGATTGTCAGGTCGCAAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:25C20A0T25G26 AS:i:80 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read003 0 chr1 2454 60 100M * 0 0 TCTTTGCTGGAAAATATCCCATCTGCGTGAGAGGGATTGTCAGGTCGCAAAAGGTGAAATTCAGAGAGGATGGCAACTGCACTCATCAACATCACAATGC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:54G26A18 AS:i:90 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read004 0 chr1 2467 60 100M * 0 0 ATATCCCATCTGCGTGAGAGGGATTGTCAGGTCGCAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCCGTCATCAAGACC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:87A12 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read005 0 chr1 2478 60 100M * 0 0 GCGTGAGAGGGATTGTCAGGTCGGAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCAGTCATCAAGACCTTTGCACCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:23C76 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read006 0 chr1 2478 60 100M * 0 0 TCGTGAGAGGGCTTGTCAGGTCGGAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCAGTCATCAAGAGCTTTGCACCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:0G10A11C63C12 AS:i:84 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read007 0 chr1 2779 60 63M * 0 0 TCCGGCAAGCACCTTTCATAAGTTTTCTTTCATTTTTTTATAATTGTTTTTCTTTATCTTGGT 222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:55T7 AS:i:58 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read008 0 chr1 2784 60 58M * 0 0 CAAGCACCTTTCATAAGTTTTCTTTCATTTTTTTATAATTGTTTTACTATTTCTTGGT 2222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:45T2T9 AS:i:48 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read009 0 chr1 2796 60 46M * 0 0 ATAAGTTTTCTTTCATTTTTTTATAATTGTTTTTCTTTTTCTTGGT 2222222222222222222222222222222222222222222222 NM:i:0 MD:Z:46 AS:i:46 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_2502854 +read010 0 chr1 2874 60 100M * 0 0 CTAAAATAATGAAATATGTAAGATAAATTCATATCAACTCAATTGGCCTTTGGTGGGACCCATATTCATAATATTTTCTTTGGAAAAAATTCAAGTAAAT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:5 MD:Z:15T8T12A22A25T13 AS:i:75 XS:i:0 ME:Z:Gypsy_Gypsy26_chr18_27801424 +read011 0 chr1 2925 60 100M * 0 0 GGTGGGACCAATATTGATAATATTTTCTTTGGAAATAATTCAAGTAAATACCATAAAGTGCATGATTGATATTGTTTGATTTTAATCGGTTTTGTAAGAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:15C80T3 AS:i:91 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read012 0 chr1 2963 60 100M * 0 0 TTCAAGTAAATACCATAAAGTGCATGATTGATATTGTTTGATTTTAATCGGTTTTGTATGAGATATTTTCTACTTACTATTATGCACTAACCTTGTCTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:69A30 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read013 0 chr1 2997 60 43M5S * 0 0 TGTTTGATTTTAATCGGTTTTTTATGAGATATTTTATACTTACAATAA 222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:21G21 AS:i:38 XS:i:0 ME:Z:Gypsy_Gypsy26_chr2_1987286 +read014 0 chr1 3039 60 100M * 0 0 CTATTATGCACTACCCTTGTCACTCATAATACCACATTGTTGATACACTATTCATGTCTGGATCATATTCTCTCTCAGTTGCTTTATCAATCTTTCCTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:8 MD:Z:13A7T9A12T12A26G4T8A1 AS:i:63 XS:i:0 ME:Z:Gypsy_Gypsy26_chr18_27801424 +read015 16 chr1 3217 60 100M * 0 0 TCTCCTTATTTAGTAGAGACTCGTTTTTAAAGATTTAGAGGGATGTTATAGTCTTTTACCGTACTTTCCCAATAGATAACATGAACCTCAGATTCGACTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:2A97 AS:i:97 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read016 16 chr1 3226 60 100M * 0 0 TTAGTAGAGACTCGTTTTTAAAGATTTAGAGGGATGTTATAGTCTTTTACCGTACTTTCCCAATAGATAACTTGAACCTCAGATTCGACTTGGATTTTCC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:71A19T8 AS:i:90 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read017 16 chr1 3246 60 100M * 0 0 AAGATTTAGAGGGATGTTATAGTCTTTTACCGTACTTTCCCAATAGATAACATGAACCTCAGATTCGACTTTGATTTTCCACGTATTGTTTTTTCCAAGT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:98T1 AS:i:98 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read018 16 chr1 3405 60 81M * 0 0 ATAAGTGATGAATCAAAATCTTTTCTAAAAAAAAAAAAAAAATTTCACAAATAATAGCAAATTTGTCATCGAATGAGAATG 222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:54A26 AS:i:76 XS:i:0 ME:Z:Gypsy_Gypsy26_chr2_1987286 +read019 16 chr1 3646 60 100M * 0 0 ACGAAACATTGTATTTTTCTTGTTTTGAAAGGATAGAAAAGAGAATTTTTTATTTTTTAATTATTACATATATTTTTTCTAAATTTTTATTCTTTTATTC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:42T23A6A26 AS:i:85 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read020 16 chr1 3776 60 100M * 0 0 AACGTTGTCAAAAAATATTTTTTGAAAATATATATTCTTAATTTTTTTTGTTTTTTGCCTTTAAAAACATAACTTTTATATAAATGTAGAATATTTTATA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:2G97 AS:i:97 XS:i:0 ME:Z:Gypsy_Gypsy26_chr18_27801424 +read021 16 chr1 3779 60 100M * 0 0 GTTGTCAAAAAATATTTTTTCAAAATATATTTTCTTAATTTTTTTTGTTTTTTGCCTTTAAAAACATAACTTTTATATAAATGTAGTAGATTTTATATAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:20G9A55A1T11 AS:i:80 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read022 16 chr1 3800 60 100M * 0 0 AAAATATATATTCTTAATTTTGTTTGTTTTTTGCCTTGAAAAACATTACTTGTATATAAATGTAGAATATTTTATATAAAAAAATAACTTTATTTTATCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:21T15T8A4T48 AS:i:80 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read023 0 chr1 21183 60 100M * 0 0 GAGCACTAAAAGTTTTAAGCTCTTTTGAAATCAAATTTTTATAAATGTTAACCTGGTTTTGAAGAAAATGTTTAAGATTTAAAAATGATTTTAACATTAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read024 0 chr1 21209 60 100M * 0 0 GAAATCAAATTTATATAAATGTTAACCTGGTTTTGAAGAAAATGTTTAAGATTTAAAAATGATTTTAAGATTAAAAAAAAGAAGTAAAAAATGATACTTA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:12T55C31 AS:i:90 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3_chr2_4407914 +read025 0 chr1 21336 60 100M * 0 0 CATATATATATATATATATATATATATATTAATTTATATAATTTTTTTAATATTTAATAAAAAATTTATCATTATTATTTTTATCATGATTTTAATTAAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:29 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read026 0 chr1 21349 60 100M * 0 0 ATATATATATATATATTAATTTATATAATTTTTTTAATATTTAATAAAAAATTTATCATTATTATTTTTATCATGATTTTAATTAAATAAAAAAAAATAT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:97G2 AS:i:97 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read027 16 chr1 21834 60 100M * 0 0 CTATCAAAAATTATTCAACGAAGTACAAAAGTTTCTACTTTTGCAAAACAACTTCAAATACATCTGAAGATTCGATTGAATGTAATATTATTAAGCAGTA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:3A14G53G27 AS:i:86 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read028 16 chr1 21945 60 100M * 0 0 CTACAGGCTCAAATTAAAATTTAAATATGTAATTCTAAGAAAATAGCATAGGATTTTTTTCTTTGAATTTTAATACACACCCACAATACCACATATTGAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:33A1A9T54 AS:i:85 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read029 16 chr1 21968 60 100M * 0 0 AATATGTAATACAAAGAAAATATCAGAGGATTTTTTTCTTTGAATTTTAATACACACACACAATACCACATATTGACCCTTAAAAATCCAGAAAAACCCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:25T31C20G21 AS:i:85 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read030 16 chr1 21982 60 100M * 0 0 AGAAAATATCATAGGATTTTTTTCTTTGAATTTTAATACACACCCACAATACCTCCTATTGACCGTTAAAAATCCAGAAAAACCCTCCCAAATCCCTTGA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:53A1A36A7 AS:i:85 XS:i:21 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read031 0 chr1 23966 60 100M * 0 0 AGCTTGTGTTGGAGGGCAGATAAGAGTGGAGACTCGAAGCCCATCATCCTCAAATCCCTGTTGGGTTTCAGAGATTTGTCGTGTCAGATTTTGAATAACT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:20G11A24A32G9 AS:i:80 XS:i:25 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read032 0 chr1 24040 60 100M * 0 0 TTTGTCGTGTCAGATTGTGAATAACTGCAAGTGCAACTGCGAATGTGGAAGTGCCTGAATGTTGGAGGAAATCGTAGTTAGTGGTGTGGTTATTATAGAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:77G13T8 AS:i:90 XS:i:0 ME:Z:Copia_Copia47_chrUn_41864102 +read033 0 chr1 24085 60 100M * 0 0 TGGAAGTGCCTGAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:58 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read034 0 chr1 24096 60 100M * 0 0 GAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGCACGAACAAGAAAGAGGAGAATTGG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:75A1A22 AS:i:90 XS:i:0 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read035 0 chr1 24118 60 100M * 0 0 TAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAAGAAGAAAGAGGAGAATTGGGAGTTTGGCTCTGCTGATTTTG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:59C40 AS:i:95 XS:i:48 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read036 0 chr1 24151 60 100M * 0 0 GAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAGCGGAGAATTGGGAGTTTGGCTCTGCTGATTTTGCTCCAAGGGTACCAGAAACTTGTGGAAGATTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:34A42T17T4 AS:i:85 XS:i:34 ME:Z:Copia_Copia47_chrUn_41864102 +read037 16 chr1 24742 60 100M * 0 0 AAAGCCACCTGGGCAAACCCACAATACCATATTGTTTGCTATTAAAAACCCAGAAAAACATTCCCAAAACCCTCAATATTTCTAGCAATTTTGACCTCAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:15C66A17 AS:i:90 XS:i:32 ME:Z:CACTA_EnSpm-5_chr16_660137 +read038 16 chr1 24773 60 100M * 0 0 TTGTTTGCTTTTAAAAACCCAGAAAAACAGTCCCAAAACCCTCAATATTTAAAGCAATTTTGACCTCAATTGTCTTGGGCTTCTTGGGCTCCGGCGGTGG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:9A19T20C49 AS:i:85 XS:i:45 ME:Z:Copia_Copia47_chrUn_41864102 +read039 16 chr1 24787 60 100M * 0 0 AAACCCAGAAAAACATTCCCAAAACCCTCAATATTTCAAGCAATTTTGACCTCAATTGTCTTGGGCTTCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:62 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read040 16 chr1 24799 60 100M * 0 0 ACATTCCCAAAACCCTCAATATTTCAAGCAATTTTGACCTCATTTGTCTTGGGCTTCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:42A57 AS:i:95 XS:i:63 ME:Z:Gypsy_Gypsy29_chr11_13193899 +read041 16 chr1 24850 51 100M * 0 0 GGCTTCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCATCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:80 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read042 16 chr1 24854 59 100M * 0 0 TCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCTTCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:49A50 AS:i:95 XS:i:71 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read043 16 chr1 24857 60 100M * 0 0 TGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCATCTTGACAAACAGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:58C41 AS:i:95 XS:i:68 ME:Z:Gypsy_Gypsy23_chr15_8310356 +read044 16 chr1 24860 60 100M * 0 0 GCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCTCCCCATCTTGACAAACCGCGGAGATCTTTTCAGTATTCGCATTCTCCGGCAGTACAAACTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:38A28A32 AS:i:90 XS:i:60 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read045 16 chr1 24872 59 100M * 0 0 GCAGCTTCTCAACTGTGACGGTCAGCACCACATCTTCACAAACCGCGGAGATCTGATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:13A15C6G17T45 AS:i:80 XS:i:53 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read046 16 chr1 24877 60 100M * 0 0 TTCTCAACAGTGACGGTCAGCACCCCATCTTCACAAACCGCGGAGTTCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:31G13A54 AS:i:90 XS:i:55 ME:Z:Gypsy_GYVIT1_chr6_13115950 +read047 16 chr1 24894 60 100M * 0 0 CAGCACCCCATCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCGC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:98T1 AS:i:98 XS:i:66 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read048 16 chr1 24895 57 100M * 0 0 AGCACCCCATCTTGACAAAACGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTAAAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCGCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:19C38C38T2 AS:i:87 XS:i:62 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read049 16 chr1 24910 49 100M * 0 0 CAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGCCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCTCACGTACTTGGCGCCTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:37G62 AS:i:95 XS:i:75 ME:Z:Gypsy_Gypsy23_chr14_11656393 +read050 16 chr1 24919 47 5S95M * 0 0 CAGCCGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCTCACGTACTTGGCGCCTTCTTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:95 AS:i:95 XS:i:77 XA:Z:1200001-1300000,-56177,5S95M,4; ME:Z:Gypsy_Gypsy23_chrUn_38723460 diff --git a/tests/data/testB-2017-06-08.bam b/tests/data/testB-2017-06-08.bam new file mode 100644 index 0000000..15ffa17 Binary files /dev/null and b/tests/data/testB-2017-06-08.bam differ diff --git a/tests/data/testB-2017-06-08.bam.bai b/tests/data/testB-2017-06-08.bam.bai new file mode 100644 index 0000000..c5769e1 Binary files /dev/null and b/tests/data/testB-2017-06-08.bam.bai differ diff --git a/tests/data/testB-2017-06-08.sam b/tests/data/testB-2017-06-08.sam new file mode 100644 index 0000000..d879389 --- /dev/null +++ b/tests/data/testB-2017-06-08.sam @@ -0,0 +1,45 @@ +@HD VN:1.3 SO:coordinate +@SQ SN:chr1 LN:25000 +@PG ID:bwa PN:bwa VN:0.7.12-r1044 CL:bwa mem -t 1 +read001 0 chr1 2353 60 100M * 0 0 CACGCAAGCCTCATTCCCTTCTCTCTATGCTTCCTGGCTTACCGTTAATTAAGATTCATGAAAGTGTTAGGGACAATGCCTCTTCTTTAGAACTGGCAAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:5 MD:Z:8A19C26G27G0A15 AS:i:75 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read002 0 chr1 2407 60 100M * 0 0 TGCATGAAAGTGTTAGGGACAATGCGTCTGATTTAGAACTGGCAACGACTTTGCTGGAAAATATCCCATCTGCTTGAGAGGGATTGTCAGGTCGCAAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:25C20A0T25G26 AS:i:80 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read003 0 chr1 2454 60 100M * 0 0 TCTTTGCTGGAAAATATCCCATCTGCGTGAGAGGGATTGTCAGGTCGCAAAAGGTGAAATTCAGAGAGGATGGCAACTGCACTCATCAACATCACAATGC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:54G26A18 AS:i:90 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read004 0 chr1 2467 60 100M * 0 0 ATATCCCATCTGCGTGAGAGGGATTGTCAGGTCGCAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCCGTCATCAAGACC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:87A12 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read005 0 chr1 2478 60 100M * 0 0 GCGTGAGAGGGATTGTCAGGTCGGAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCAGTCATCAAGACCTTTGCACCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:23C76 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read006 0 chr1 2478 60 100M * 0 0 TCGTGAGAGGGCTTGTCAGGTCGGAAAAGGGGAAATTCAGAGAGGATGGCAACTGCAATCATCAACATCACAATGCAGTCATCAAGAGCTTTGCACCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:0G10A11C63C12 AS:i:84 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read007 0 chr1 2779 60 63M * 0 0 TCCGGCAAGCACCTTTCATAAGTTTTCTTTCATTTTTTTATAATTGTTTTTCTTTATCTTGGT 222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:55T7 AS:i:58 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read008 0 chr1 2784 60 58M * 0 0 CAAGCACCTTTCATAAGTTTTCTTTCATTTTTTTATAATTGTTTTACTATTTCTTGGT 2222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:45T2T9 AS:i:48 XS:i:0 ME:Z:Gypsy_Gypsy26_chr15_18793972 +read009 0 chr1 2796 60 46M * 0 0 ATAAGTTTTCTTTCATTTTTTTATAATTGTTTTTCTTTTTCTTGGT 2222222222222222222222222222222222222222222222 NM:i:0 MD:Z:46 AS:i:46 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_2502854 +read010 0 chr1 2874 60 100M * 0 0 CTAAAATAATGAAATATGTAAGATAAATTCATATCAACTCAATTGGCCTTTGGTGGGACCCATATTCATAATATTTTCTTTGGAAAAAATTCAAGTAAAT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:5 MD:Z:15T8T12A22A25T13 AS:i:75 XS:i:0 ME:Z:Gypsy_Gypsy26_chr18_27801424 +read011 0 chr1 2925 60 100M * 0 0 GGTGGGACCAATATTGATAATATTTTCTTTGGAAATAATTCAAGTAAATACCATAAAGTGCATGATTGATATTGTTTGATTTTAATCGGTTTTGTAAGAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:15C80T3 AS:i:91 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read012 0 chr1 2963 60 100M * 0 0 TTCAAGTAAATACCATAAAGTGCATGATTGATATTGTTTGATTTTAATCGGTTTTGTATGAGATATTTTCTACTTACTATTATGCACTAACCTTGTCTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:69A30 AS:i:95 XS:i:0 ME:Z:Gypsy_Gypsy26_chr8_5114633 +read013 0 chr1 2997 60 43M5S * 0 0 TGTTTGATTTTAATCGGTTTTTTATGAGATATTTTATACTTACAATAA 222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:21G21 AS:i:38 XS:i:0 ME:Z:Gypsy_Gypsy26_chr2_1987286 +read014 0 chr1 3039 60 100M * 0 0 CTATTATGCACTACCCTTGTCACTCATAATACCACATTGTTGATACACTATTCATGTCTGGATCATATTCTCTCTCAGTTGCTTTATCAATCTTTCCTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:8 MD:Z:13A7T9A12T12A26G4T8A1 AS:i:63 XS:i:0 ME:Z:Gypsy_Gypsy26_chr18_27801424 +read015 0 chr1 21183 60 100M * 0 0 GAGCACTAAAAGTTTTAAGCTCTTTTGAAATCAAATTTTTATAAATGTTAACCTGGTTTTGAAGAAAATGTTTAAGATTTAAAAATGATTTTAACATTAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read016 0 chr1 21209 60 100M * 0 0 GAAATCAAATTTATATAAATGTTAACCTGGTTTTGAAGAAAATGTTTAAGATTTAAAAATGATTTTAAGATTAAAAAAAAGAAGTAAAAAATGATACTTA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:12T55C31 AS:i:90 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3_chr2_4407914 +read017 0 chr1 21336 60 100M * 0 0 CATATATATATATATATATATATATATATTAATTTATATAATTTTTTTAATATTTAATAAAAAATTTATCATTATTATTTTTATCATGATTTTAATTAAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:29 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read018 0 chr1 21349 60 100M * 0 0 ATATATATATATATATTAATTTATATAATTTTTTTAATATTTAATAAAAAATTTATCATTATTATTTTTATCATGATTTTAATTAAATAAAAAAAAATAT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:97G2 AS:i:97 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read019 16 chr1 21834 60 100M * 0 0 CTATCAAAAATTATTCAACGAAGTACAAAAGTTTCTACTTTTGCAAAACAACTTCAAATACATCTGAAGATTCGATTGAATGTAATATTATTAAGCAGTA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:3A14G53G27 AS:i:86 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read020 16 chr1 21834 60 100M * 0 0 CTATCAAAAATTATTCAACGAAGTACAAAAGTTTCTACTTTTGCAAAACAACTTCAAATACATCTGAAGATTCGATTGAATGTAATATTATTAAGCAGTA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:3A14G53G27 AS:i:86 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read021 16 chr1 21945 60 100M * 0 0 CTACAGGCTCAAATTAAAATTTAAATATGTAATTCTAAGAAAATAGCATAGGATTTTTTTCTTTGAATTTTAATACACACCCACAATACCACATATTGAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:33A1A9T54 AS:i:85 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read022 16 chr1 21968 60 100M * 0 0 AATATGTAATACAAAGAAAATATCAGAGGATTTTTTTCTTTGAATTTTAATACACACACACAATACCACATATTGACCCTTAAAAATCCAGAAAAACCCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:25T31C20G21 AS:i:85 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read023 16 chr1 21968 60 100M * 0 0 AATATGTAATACAAAGAAAATATCAGAGGATTTTTTTCTTTGAATTTTAATACACACACACAATACCACATATTGACCCTTAAAAATCCAGAAAAACCCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:25T31C20G21 AS:i:85 XS:i:0 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read024 16 chr1 21982 60 100M * 0 0 AGAAAATATCATAGGATTTTTTTCTTTGAATTTTAATACACACCCACAATACCTCCTATTGACCGTTAAAAATCCAGAAAAACCCTCCCAAATCCCTTGA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:53A1A36A7 AS:i:85 XS:i:21 ME:Z:PIF-Harbinger_Harbinger-3N3_chr16_20723579 +read025 0 chr1 23966 60 100M * 0 0 AGCTTGTGTTGGAGGGCAGATAAGAGTGGAGACTCGAAGCCCATCATCCTCAAATCCCTGTTGGGTTTCAGAGATTTGTCGTGTCAGATTTTGAATAACT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:20G11A24A32G9 AS:i:80 XS:i:25 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read026 0 chr1 24040 60 100M * 0 0 TTTGTCGTGTCAGATTGTGAATAACTGCAAGTGCAACTGCGAATGTGGAAGTGCCTGAATGTTGGAGGAAATCGTAGTTAGTGGTGTGGTTATTATAGAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:77G13T8 AS:i:90 XS:i:0 ME:Z:Copia_Copia47_chrUn_41864102 +read027 0 chr1 24085 60 100M * 0 0 TGGAAGTGCCTGAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:58 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read028 0 chr1 24085 60 100M * 0 0 TGGAAGTGCCTGAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:58 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read029 0 chr1 24085 60 100M * 0 0 TGGAAGTGCCTGAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:58 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read030 0 chr1 24096 60 100M * 0 0 GAATGTTGGAGGAAATCGTAGGTAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGCACGAACAAGAAAGAGGAGAATTGG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:75A1A22 AS:i:90 XS:i:0 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read031 0 chr1 24118 60 100M * 0 0 TAGTGGTGTGGTTTTTATAGAAGACAGAGTTGGGAAACAGAGGAAGGTTCTGGAAAGAAGAAGAAAGAGGAGAATTGGGAGTTTGGCTCTGCTGATTTTG 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:59C40 AS:i:95 XS:i:48 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read032 0 chr1 24151 60 100M * 0 0 GAAACAGAGGAAGGTTCTGGAAAGAACAAGAAAGCGGAGAATTGGGAGTTTGGCTCTGCTGATTTTGCTCCAAGGGTACCAGAAACTTGTGGAAGATTCT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:34A42T17T4 AS:i:85 XS:i:34 ME:Z:Copia_Copia47_chrUn_41864102 +read033 16 chr1 24850 51 100M * 0 0 GGCTTCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCATCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:100 AS:i:100 XS:i:80 ME:Z:Gypsy_Gypsy7_chr4_10302390 +read034 16 chr1 24854 59 100M * 0 0 TCTTGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCTTCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTAC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:49A50 AS:i:95 XS:i:71 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read035 16 chr1 24857 59 100M * 0 0 TGGGCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCACCCCATCTTGACAAACAGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:58C41 AS:i:95 XS:i:68 ME:Z:Gypsy_Gypsy23_chr15_8310356 +read036 16 chr1 24860 59 100M * 0 0 GCTCCGGCGGTGGCAGCTTCTCAACAGTGACGGTCAGCTCCCCATCTTGACAAACCGCGGAGATCTTTTCAGTATTCGCATTCTCCGGCAGTACAAACTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:38A28A32 AS:i:90 XS:i:60 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read037 16 chr1 24872 59 100M * 0 0 GCAGCTTCTCAACTGTGACGGTCAGCACCACATCTTCACAAACCGCGGAGATCTGATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:4 MD:Z:13A15C6G17T45 AS:i:80 XS:i:53 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read038 16 chr1 24877 59 100M * 0 0 TTCTCAACAGTGACGGTCAGCACCCCATCTTCACAAACCGCGGAGTTCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:2 MD:Z:31G13A54 AS:i:90 XS:i:55 ME:Z:Gypsy_GYVIT1_chr6_13115950 +read039 16 chr1 24894 59 100M * 0 0 CAGCACCCCATCTTGACAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCGC 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:98T1 AS:i:98 XS:i:66 ME:Z:Gypsy_Gypsy23_chrUn_38723460 +read040 16 chr1 24895 57 100M * 0 0 AGCACCCCATCTTGACAAAACGCGGAGATCTTATCAGTATTCGCATTCTCCGGCAGTAAAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCGCA 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:3 MD:Z:19C38C38T2 AS:i:87 XS:i:62 ME:Z:Gypsy_Gypsy12_chr1_12715223 +read041 16 chr1 24910 49 100M * 0 0 CAAACCGCGGAGATCTTATCAGTATTCGCATTCTCCGCCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCTCACGTACTTGGCGCCTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:1 MD:Z:37G62 AS:i:95 XS:i:75 ME:Z:Gypsy_Gypsy23_chr14_11656393 +read042 16 chr1 24919 47 5S95M * 0 0 CAGCCGAGATCTTATCAGTATTCGCATTCTCCGGCAGTACAAACTTCCTCATAAACTTCCCCACCCTCCTCTCCATTCTCACGTACTTGGCGCCTTCTTT 2222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222222 NM:i:0 MD:Z:95 AS:i:95 XS:i:77 XA:Z:1200001-1300000,-56177,5S95M,4; ME:Z:Gypsy_Gypsy23_chrUn_38723460 diff --git a/tests/test_bamio.py b/tests/test_bamio.py index 4b75650..f7b333c 100644 --- a/tests/test_bamio.py +++ b/tests/test_bamio.py @@ -2,6 +2,24 @@ from tefingerprint import bamio import pytest +import os +DATA_PATH = os.path.dirname(os.path.realpath(__file__)) + '/data/' + + +def test_extract_bam_references(): + query = bamio.extract_bam_references(DATA_PATH + 'testA-2017-06-08.bam') + answer = ['chr1:1-25000'] + assert query == answer + + +def test_extract_multiple_bam_references(): + query = bamio.extract_bam_references(DATA_PATH + 'testA-2017-06-08.bam', + DATA_PATH + 'testB-2017-06-08.bam') + answer = ['chr1:1-25000'] + assert query == answer + + + @pytest.mark.parametrize("string,answer", diff --git a/tests/test_batch.py b/tests/test_batch.py new file mode 100644 index 0000000..81ee9ba --- /dev/null +++ b/tests/test_batch.py @@ -0,0 +1,192 @@ +#! /usr/bin/env python + +import os +import pytest +import numpy as np +import numpy.testing as npt +from tefingerprint import loci +from tefingerprint import batch + +DATA_PATH = os.path.dirname(os.path.realpath(__file__)) + '/data/' + + +class TestFingerprint: + """Integration tests for batch fingerprinting""" + CORES = 1 + + @pytest.mark.parametrize("parameters, answer", + [({'bams': [DATA_PATH + 'testB-2017-06-08.bam'], + 'categories':['Gypsy', 'PIF'], + 'references':['chr1'], + 'quality':30, + 'transposon_tag':'ME', + 'min_reads':5, + 'eps':250, + 'min_eps':0, + 'hierarchical':True}, + {('chr1:1-25000', '+', 'PIF', 'testB-2017-06-08.bam'): + np.array([], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '+', 'Gypsy', 'testB-2017-06-08.bam'): + np.array([(2452, 2577), (2841, 3138), (24065, 24217)], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '-', 'Gypsy', 'testB-2017-06-08.bam'): + np.array([(24850, 24919)], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '-', 'PIF', 'testB-2017-06-08.bam'): + np.array([(21834, 21982)], + dtype=loci.FingerPrint._DTYPE_LOCI)}), + ({'bams': [DATA_PATH + 'testB-2017-06-08.bam'], + 'categories': ['Gypsy', 'PIF'], + 'references': ['chr1'], + 'quality': 60, + 'transposon_tag': 'ME', + 'min_reads': 5, + 'eps': 250, + 'min_eps': 0, + 'hierarchical': True}, + {('chr1:1-25000', '+', 'PIF', 'testB-2017-06-08.bam'): + np.array([], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '+', 'Gypsy', 'testB-2017-06-08.bam'): + np.array([(2452, 2577), (2841, 3138), (24065, 24217)], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '-', 'Gypsy', 'testB-2017-06-08.bam'): + np.array([], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '-', 'PIF', 'testB-2017-06-08.bam'): + np.array([(21834, 21982)], + dtype=loci.FingerPrint._DTYPE_LOCI)}), + ({'bams': [DATA_PATH + 'testB-2017-06-08.bam'], + 'categories': ['INVALID'], + 'references': ['chr1'], + 'quality': 30, + 'transposon_tag': 'ME', + 'min_reads': 5, + 'eps': 250, + 'min_eps': 0, + 'hierarchical': True}, + {('chr1:1-25000', '+', 'INVALID', 'testB-2017-06-08.bam'): + np.array([], + dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-25000', '-', 'INVALID', 'testB-2017-06-08.bam'): + np.array([], + dtype=loci.FingerPrint._DTYPE_LOCI)}) + ]) + def test_fingerprint(self, parameters, answer): + parameters['cores'] = self.CORES + query = batch.fingerprint(**parameters) + answer = loci.FingerPrint.from_dict(answer) + + assert set(query.keys()) == set(answer.keys()) + for key in query.keys(): + npt.assert_array_equal(query[key], answer[key]) + + +class TestFingerprintMulticore(TestFingerprint): + """Multi-core versions of integration tests for batch fingerprinting""" + CORES = 2 + + +class TestComparison: + """Integration tests for batch fingerprinting""" + CORES = 1 + + @pytest.mark.parametrize("parameters, answer", + [({'bams': [DATA_PATH + 'testA-2017-06-08.bam', + DATA_PATH + 'testB-2017-06-08.bam'], + 'categories': ['Gypsy', 'PIF'], + 'references': ['chr1'], + 'quality': 30, + 'transposon_tag': 'ME', + 'min_reads': 5, + 'eps': 250, + 'min_eps': 0, + 'hierarchical': True, + 'fingerprint_buffer': 50, + 'bin_buffer': 0}, + {('chr1:1-25000', '-', 'PIF'): np.array([(21784, 22032, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [4, 6], + [0.4, 0.6])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '-', 'Gypsy'): np.array([(24737, 24969, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [12, 10], + [0.545, 0.455])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '+', 'Gypsy'): np.array([(2402, 2627, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [6, 6], + [0.5, 0.5]), + (2791, 3188, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [8, 8], + [0.5, 0.5]), + (24015, 24267, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [4, 6], + [0.4, 0.6])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '+', 'PIF'): np.array([], + dtype=loci.Comparison._DTYPE_LOCI)}), + ({'bams': [DATA_PATH + 'testA-2017-06-08.bam', + DATA_PATH + 'testB-2017-06-08.bam'], + 'categories': ['Gypsy', 'PIF'], + 'references': ['chr1'], + 'quality': 60, + 'transposon_tag': 'ME', + 'min_reads': 5, + 'eps': 250, + 'min_eps': 0, + 'hierarchical': True, + 'fingerprint_buffer': 50, + 'bin_buffer': 0}, + {('chr1:1-25000', '-', 'PIF'): np.array([(21784, 22032, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [4, 6], + [0.4, 0.6])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '-', 'Gypsy'): np.array([(24737, 24944, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [6, 0], + [1.0, 0.0])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '+', 'Gypsy'): np.array([(2402, 2627, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [6, 6], [0.5, 0.5]), + (2791, 3188, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [8, 8], + [0.5, 0.5]), + (24015, 24267, + ['testA-2017-06-08.bam', + 'testB-2017-06-08.bam'], + [4, 6], + [0.4, 0.6])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-25000', '+', 'PIF'): np.array([], + dtype=loci.Comparison._DTYPE_LOCI)}) + ]) + def test_comparison(self, parameters, answer): + parameters['cores'] = self.CORES + query = batch.comparison(**parameters) + answer = loci.Comparison.from_dict(answer) + + assert set(query.keys()) == set(answer.keys()) + for key in query.keys(): + npt.assert_array_equal(query[key], answer[key]) + + +class TestComparisonMulticore(TestComparison): + """Multi-core versions of integration tests for batch fingerprinting""" + CORES = 2 diff --git a/tests/test_loci.py b/tests/test_loci.py index 1191f19..91415ea 100644 --- a/tests/test_loci.py +++ b/tests/test_loci.py @@ -1,5 +1,6 @@ #! /usr/bin/env python +import os import pytest import numpy as np import numpy.testing as npt @@ -234,7 +235,23 @@ class TestReadLoci: """Tests for class ReadLoci""" def test_from_bam(self): """""" - pass + data_path = os.path.dirname(os.path.realpath(__file__)) + '/data/testA-2017-06-08.bam' + query = loci.ReadLoci.from_bam(data_path, categories=['CACTA', 'Copia']) + + dictionary = {('chr1:1-25000', '+', 'Copia', 'testA-2017-06-08.bam'): np.array([(24040, 24139, 'read032'), + (24151, 24250, 'read036')], + dtype=loci.ReadLoci._DTYPE_LOCI), + ('chr1:1-25000', '-', 'Copia', 'testA-2017-06-08.bam'): np.array([(24773, 24872, 'read038')], + dtype=loci.ReadLoci._DTYPE_LOCI), + ('chr1:1-25000', '-', 'CACTA', 'testA-2017-06-08.bam'): np.array([(24742, 24841, 'read037')], + dtype=loci.ReadLoci._DTYPE_LOCI), + ('chr1:1-25000', '+', 'CACTA', 'testA-2017-06-08.bam'): np.array([], + dtype=loci.ReadLoci._DTYPE_LOCI)} + answer = loci.ReadLoci.from_dict(dictionary) + + assert set(query.keys()) == set(answer.keys()) + for key in query.keys(): + npt.assert_array_equal(query[key], answer[key]) def test_tips(self): """Test for method to extract the tips of loci as a dict""" @@ -508,6 +525,48 @@ def test_as_gff(self): assert query == answer + def test_as_csv(self): + """""" + query = loci.FingerPrint.from_dict({('chr1:1-3000', + '+', + 'Gypsy', + 'bam1'): np.array([(0, 577), + (879, 1234), + (1662, 1917)], dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr1:1-3000', + '-', + 'Gypsy', + 'bam1'): np.array([(20, 570), + (870, 1230), + (1662, 1917)], dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr2:1-4000', + '+', + 'Gypsy', + 'bam1'): np.array([(0, 577), + (879, 1234), + (1662, 1917)], dtype=loci.FingerPrint._DTYPE_LOCI), + ('chr2:1-4000', + '-', + 'Gypsy', + 'bam1'): np.array([], dtype=loci.FingerPrint._DTYPE_LOCI) + }) + query = query.as_csv().splitlines() + query.sort() + + answer = ['reference,strand,category,source,start,stop', + '"chr1","+","Gypsy","bam1",0,577', + '"chr1","-","Gypsy","bam1",20,570', + '"chr1","-","Gypsy","bam1",870,1230', + '"chr1","+","Gypsy","bam1",879,1234', + '"chr1","-","Gypsy","bam1",1662,1917', + '"chr1","+","Gypsy","bam1",1662,1917', + '"chr2","+","Gypsy","bam1",0,577', + '"chr2","+","Gypsy","bam1",879,1234', + '"chr2","+","Gypsy","bam1",1662,1917'] + answer.sort() + + assert query == answer + class TestComparativeBins: """Tests for class ComparativeBins""" @@ -780,3 +839,52 @@ def test_as_flat_gff(self): answer.sort() assert query == answer + + def test_as_flat_csv(self): + """""" + query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]), + (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]), + (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]), + (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]), + (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])], + dtype=loci.Comparison._DTYPE_LOCI)} + query = loci.Comparison.from_dict(query) + query = query.as_flat_csv().splitlines() + query.sort() + + answer = ['reference,strand,category,start,stop,source,count,proportion', + '"chr1","+","Copia",0,20,"bam1",3,1.0', + '"chr1","+","Copia",0,20,"bam2",0,0.0', + '"chr1","+","Gypsy",0,20,"bam1",3,0.75', + '"chr1","+","Gypsy",0,20,"bam2",1,0.25', + '"chr1","+","Copia",21,30,"bam1",5,1.0', + '"chr1","+","Copia",21,30,"bam2",0,0.0', + '"chr1","+","Gypsy",21,30,"bam1",5,0.5', + '"chr1","+","Gypsy",21,30,"bam2",5,0.5', + '"chr1","+","Copia",35,40,"bam1",3,1.0', + '"chr1","+","Copia",35,40,"bam2",0,0.0', + '"chr1","+","Gypsy",35,40,"bam1",3,1.0', + '"chr1","+","Gypsy",35,40,"bam2",0,0.0'] + answer.sort() + + def test_as_character_csv(self): + """""" + query = {('chr1:1-3000', '+', 'Copia'): np.array([(0, 20, ['bam1', 'bam2'], [3, 0], [1.0, 0.0]), + (21, 30, ['bam1', 'bam2'], [5, 0], [1.0, 0.0]), + (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])], + dtype=loci.Comparison._DTYPE_LOCI), + ('chr1:1-3000', '+', 'Gypsy'): np.array([(0, 20, ['bam1', 'bam2'], [3, 1], [0.75, 0.25]), + (21, 30, ['bam1', 'bam2'], [5, 5], [0.5, 0.5]), + (35, 40, ['bam1', 'bam2'], [3, 0], [1.0, 0.0])], + dtype=loci.Comparison._DTYPE_LOCI)} + query = loci.Comparison.from_dict(query) + query = query.as_character_csv() + + + answer = '\n'.join(['"sample","chr1:0-20_+_Copia","chr1:0-20_+_Gypsy","chr1:21-30_+_Copia","chr1:21-30_+_Gypsy","chr1:35-40_+_Copia","chr1:35-40_+_Gypsy"', + 'bam1, 3,3,5,5,3,3', + 'bam2, 0,1,0,5,0,0']) + + assert query == answer