diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index 18485f2..ed237f7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -43,7 +43,9 @@ jobs: run: pycodestyle . - name: Run unit tests - run: coverage run -m unittest && coverage lcov + run: | + export NUMBA_DISABLE_JIT=1 + coverage run -m unittest && coverage lcov - name: Coveralls uses: coverallsapp/github-action@v2 diff --git a/CHANGELOG.md b/CHANGELOG.md index ee911bf..6e8a770 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,9 +1,13 @@ # Change Log -## Version 0.1.6-dev +## Version 0.1.7-dev + +### Added +- Formally adopted the NumPy + Numba solution in the ordinal mapper. This significantly accelerated the algorithm ([#209](https://github.com/qiyunzhu/woltka/pull/209)). ### Changed - Changed default output subject coverage (`--outcov`) coordinates into BED-like (0-based, exclusive end). The output can be directly parsed by programs like bedtools. Also added support for GFF-like and other custom formats, as controled by paramter `--outcov-fmt` ([#204](https://github.com/qiyunzhu/woltka/pull/204) and [#205](https://github.com/qiyunzhu/woltka/pull/205)). +- Default chunk size is now 1024 for plain and range mapeprs, and 2 ** 20 = 1048576 for ordinal mapper. The latter represents the number of valid query-subject pairs ([#209](https://github.com/qiyunzhu/woltka/pull/209)). ## Version 0.1.6 (2/22/2024) diff --git a/ci/conda_requirements.txt b/ci/conda_requirements.txt index c6dd7e8..98c9870 100644 --- a/ci/conda_requirements.txt +++ b/ci/conda_requirements.txt @@ -1,2 +1,3 @@ -cython +numpy +numba biom-format diff --git a/doc/cli.md b/doc/cli.md index b44272c..796ad1d 100644 --- a/doc/cli.md +++ b/doc/cli.md @@ -109,8 +109,8 @@ Option | Description Option | Description --- | --- -`--chunk` | Number of unique queries to read and parse in each chunk of an alignment file. Default: 1,000 for plain or range mapping, or 1,000,000 for ordinal mapping. -`--cache` | Number of recent classification results to cache for faster subsequent classifications. Default: 1024. +`--chunk` | Number of unique queries to read and parse in each chunk of an alignment file. Default: 1,024 for plain or range mapping, or 2 ** 20 = 1,048,576 for ordinal mapping. The latter cannot exceed 2 ** 22. +`--cache` | Number of recent classification results to cache for faster subsequent classifications. Default: 1,024. `--no-exe` | Disable calling external programs (`gzip`, `bzip2` and `xz`) for decompression. Otherwise, Woltka will use them if available for faster processing, or switch back to Python if not. diff --git a/doc/perform.md b/doc/perform.md index fbeab5c..be5c620 100644 --- a/doc/perform.md +++ b/doc/perform.md @@ -62,7 +62,7 @@ Simple read-gene matching, with Numba [acceleration](install.md#acceleration) | Two Woltka parameters visibly impacts Woltka's speed: -- `--chunk` | Number of unique queries to read and parse in each chunk of an alignment file. Default: 1,000 for plain or range mapping, or 1,000,000 for ordinal mapping. +- `--chunk` | Number of unique queries to read and parse in each chunk of an alignment file. Default: 1,024 for plain or range mapping, or 2 ** 20 = 1,048,576 for ordinal mapping. The latter cannot exceed 2 ** 22. - `--cache` | Number of recent classification results to cache for faster subsequent classifications. Default: 1024. Their default values were set based on our experience. However, alternative values could improve (or reduce) performance depending on the computer hardware, input file type, and database capacity. if you plan to routinely process bulks of biological big data using the same setting, we recommend that you do a few test runs on a small dataset and find out the values that work the best for you. diff --git a/woltka/align.py b/woltka/align.py index e1f804c..c57c883 100644 --- a/woltka/align.py +++ b/woltka/align.py @@ -8,7 +8,6 @@ # The full license is in the file LICENSE, distributed with this software. # ---------------------------------------------------------------------------- - """Functions for parsing alignment / mapping files. Notes @@ -45,7 +44,7 @@ from functools import lru_cache -def plain_mapper(fh, fmt=None, excl=None, n=1000): +def plain_mapper(fh, fmt=None, excl=None, n=1024): """Read an alignment file in chunks and yield query-to-subject(s) maps. Parameters @@ -619,59 +618,6 @@ def cigar_to_lens_ord(cigar): return align, align + offset -def parse_sam_file_pd(fh, n=65536): - """Parse a SAM file (sam) using Pandas. - - Parameters - ---------- - fh : file handle - SAM file to parse. - n : int, optional - Chunk size. - - Yields - ------ - None - - Notes - ----- - This is a SAM file parser using Pandas. It is slower than the current - parser. The `read_csv` is fast, but the data frame manipulation slows - down the process. It is here for reference only. - """ - return - # with pd.read_csv(fp, sep='\t', - # header=None, - # comment='@', - # na_values='*', - # usecols=[0, 1, 2, 3, 5], - # names=['qname', 'flag', 'rname', 'pos', 'cigar'], - # dtype={'qname': str, - # 'flag': np.uint16, - # 'rname': str, - # 'pos': int, - # 'cigar': str}, - # chunksize=n) as reader: - # for chunk in reader: - # chunk.dropna(subset=['rname'], inplace=True) - # # this is slow, because of function all - # chunk['length'], offset = zip(*chunk['cigar'].apply( - # cigar_to_lens)) - # chunk['right'] = chunk['pos'] + offset - # # this is slow, because of function all - # # chunk['qname'] = chunk[['qname', 'flag']].apply( - # # qname_by_flag, axis=1) - # # this is a faster method - # chunk['qname'] += np.where( - # chunk['qname'].str[-2:].isin(('/1', '/2')), '', - # np.where(np.bitwise_and(chunk['flag'], (1 << 6)), '/1', - # np.where(np.bitwise_and(chunk['flag'], (1 << 7)), - # '/2', ''))) - # chunk['score'] = 0 - # yield from chunk[['qname', 'rname', 'score', 'length', - # 'pos', 'right']].values - - def parse_map_file(fh, *args): """Parse a simple mapping file. diff --git a/woltka/biom.py b/woltka/biom.py index e50e2dc..21fae89 100644 --- a/woltka/biom.py +++ b/woltka/biom.py @@ -164,8 +164,8 @@ def round_biom(table: biom.Table, digits=0): digits : int, optional Digits after the decimal point. - Notes - ----- + Examples + -------- There is a fully vectorized, much faster alternate: >>> arr = table.matrix_data.data diff --git a/woltka/ordinal.py b/woltka/ordinal.py index 931784a..439d6ee 100644 --- a/woltka/ordinal.py +++ b/woltka/ordinal.py @@ -9,16 +9,162 @@ # ---------------------------------------------------------------------------- """Functions for matching reads and genes using an ordinal system. + +This module matches reads and genes based on their coordinates in the genome. +It uses an efficient algorithm, in which starting and ending positions of reads +and genes are flattened and sorted by coordinate into one sequence, followed by +one iteration over the sequence to identify all read-gene overlaps. + +This algorithm was inspired by the sweep line algorithm [1] in computational +geometry. + + [1] Shamos, Michael Ian, and Dan Hoey. "Geometric intersection problems." + 17th Annual Symposium on Foundations of Computer Science (sfcs 1976). + IEEE, 1976. + +In this sequence, each position contains 4 pieces of information: + + 0. Coordinate (nt). + 1. Whether start or end. + 2. Whether gene or read. + 3. Index of gene / read. + +For efficient computing, the information is stored as a 64-bit signed integer +(int64), and accessed using bitwise operations. The bits are defined as (from +right to left): + + Bits 1-22 (22): Index of gene / read (max.: 4,194,303). + Bits 23 (1): Whether it is a read (0) or a gene (1). + Bits 24 (1): Whether it is a start (0) or an end (1). + Bits 25-63 (39): Coordinate (max.: 549,755,813,887). + +To construct a position: + + code = (nt << 24) + is_gene * (1 << 23) + is_end * (1 << 22) + idx + +To extract information from a position: + + Index: code & (1 << 22) - 1 + Gene or read: code & (1 << 22) + End or start: code & (1 << 23) + Coordinate: code >> 24 + +Rationales of this design: + + Index (max. = 4 million): It is larger than the largest number of protein- + coding genes in a genome (Trichomonas vaginalis, ca. 60,000). On the other + side, the input chunk size should be no more than 4 million, otherwise it + is possible that reads mapped to a single genome exceed the upper limit of + indices. + + Read (0) or gene (1): Adding one bit costs some compute. It should be done + during gene queue construction, which only happens during database loading, + instead of during read queue construction, which happens for every chunk of + input data. + + Start (0) or end (1): This ensures that the points (uint64) can be directly + sorted in ascending order, such that start always occurs before end, even + if their coordinates are equal (i.e., the gene/read has a length of 1 bp). + + Coordinate (max. = 550 billion): It is larger than the largest known genome + (Tmesipteris oblanceolata, 160 Gbp). Therefore it should be sufficient for + the desired application. + +Notes: + + If data type is uint64 instead of int64, the maximum coordinate can be 2x + as large. However, NumPy does not allow bitwise operations on uint64. + + To remove the upper limit of coordinate, one may remove `dtype=np.int64` + from the code. This will slightly slow down the code even if no coordinate + exceeds the upper limit, and it will notably reduce efficiency when some + coordinates exceed the upper limit, because the array will downgrade to + data type to 'O' (Python object), which is inefficient. """ from collections import defaultdict -from itertools import chain -from bisect import bisect + +import numpy as np +from numba import njit +from numba.typed import Dict +from numba.types import uint32, int64, boolean +from numba import config from .align import iter_align +nojit = config.DISABLE_JIT + -def ordinal_mapper(fh, coords, idmap, fmt=None, excl=None, n=1000000, th=0.8, +def match_read_gene_dummy(queue, lens, th): + """Associate reads with genes based on a sorted queue of coordinates. + + Parameters + ---------- + queue : list of tuple of (int, bool, bool, int) + Sorted queue of coordinates (location, start or end, gene or read, + index). + lens : dict + Read-to-alignment length map. + th : float + Threshold for read/gene overlapping fraction. + + Yields + ------ + int + Read index. + int + Gene index. + + See Also + -------- + match_read_gene + .tests.test_ordinal.OrdinalTests.test_match_read_gene_dummy + + Notes + ----- + This is a dummy function which is only for test and demonstration purpose, + but is not called anywhere in the program. + + The formal function `match_read_gene` extensively uses bitwise operations + and thus is hard to read. Therefore the current function, which represents + the original form of prior to optimization, is retained. + """ + genes = {} # current genes + reads = {} # current reads + + # walk through flattened queue of reads and genes + for loc, is_end, is_gene, idx in queue: + if is_gene: + + # when a gene starts, added to gene cache + if not is_end: + genes[idx] = loc + + # when a gene ends, + else: + + # find gene start and remove it from cache + gloc = genes.pop(idx) + + # check cached reads for matches + for rid, rloc in reads.items(): + + # is a match if read/gene overlap is long enough + if loc - max(gloc, rloc) >= lens[rid] * th: + yield rid, idx + + # the same for reads + else: + if not is_end: + reads[idx] = loc + else: + rloc = reads.pop(idx) + for gid, gloc in genes.items(): + if loc - max(rloc, gloc) >= lens[idx] * th: + yield idx, gid + + +def ordinal_mapper(fh, coords, idmap, fmt=None, excl=None, n=2**20, th=0.8, prefix=False): """Read an alignment file and match reads and genes in an ordinal system. @@ -35,7 +181,7 @@ def ordinal_mapper(fh, coords, idmap, fmt=None, excl=None, n=1000000, th=0.8, excl : set, optional Subjects to exclude. n : int, optional - Number of unique queries per chunk. + Number of reads per chunk (max. 2 ** 22 = 4,194,304). th : float Minimum threshold of overlap length : alignment length for a match. prefix : bool @@ -47,161 +193,136 @@ def ordinal_mapper(fh, coords, idmap, fmt=None, excl=None, n=1000000, th=0.8, Yields ------ - tuple of str + list of str Query queue. - dict of set of str + list of set of str Subject(s) queue. """ it = iter_align(fh, fmt, excl, True) - # cached list of query Ids for reverse look-up + # arguments for flush_chunk + args = (coords, idmap, th, prefix) + + # cached lists of read Ids and lengths (pre-allocate space) # gene Ids are unique, but read Ids can have duplicates (i.e., one read is # mapped to multiple loci on a genome), therefore an incremental integer # here replaces the original read Id as its identifer - rids = [] - rid_append = rids.append + rids = [None] * n + lens = np.empty((n,), dtype=np.uint32) - # cached map of read to coordinates + # cached map of reads to per-genome coordinates locmap = defaultdict(list) - def flush(): - """Match reads in current chunk with genes from all nucleotides. - - Returns - ------- - tuple of str - Query queue. - dict of set of str - Subject(s) queue. - """ - # master read-to-gene(s) map - res = defaultdict(set) - - # iterate over nucleotides - for nucl, locs in locmap.items(): - - # it's possible that no gene was annotated on the nucleotide - try: - glocs = coords[nucl] - except KeyError: - continue - - # get reference to gene identifiers - gids = idmap[nucl] - - # append prefix if needed - pfx = nucl + '_' if prefix else '' - - # execute ordinal algorithm when reads are many - # 8 (5+ reads) is an empirically determined cutoff - if len(locs) > 8: - - # merge and sort coordinates - # question is to add unsorted read coordinates into pre-sorted - # gene coordinates - # Python's Timsort algorithm is efficient for this task - queue = sorted(chain(glocs, locs)) - - # map reads to genes using the core algorithm - for read, gene in match_read_gene(queue): - - # add read-gene pairs to the master map - res[rids[read]].add(pfx + gids[gene]) - - # execute naive algorithm when reads are few - else: - for read, gene in match_read_gene_quart(glocs, locs): - res[rids[read]].add(pfx + gids[gene]) - - # return matching read Ids and gene Ids - return res.keys(), res.values() - - # target query count at end of current chunk - target = n + # current read index in the cached lists; will reset after each flush + idx = 0 # parse alignment file - for i, (query, records) in enumerate(it): + for query, records in it: - # when chunk limit is reached - if i == target: + # exclude hits with unavailable or zero length + records = [x for x in records if x[2]] - # flush: match currently cached reads with genes and yield - yield flush() - - # re-initiate read Ids, length map and location map - rids = [] - rid_append = rids.append + # when chunk limit is about to be exceeded by the next query, match + # currently cached reads with genes, flush, and reset + if idx + len(records) > n: + yield flush_chunk(idx, locmap, rids, lens, *args) locmap = defaultdict(list) + idx = 0 - # next target line number - target += n - - # extract alignment info + # extract alignment info and add to cache for subject, _, length, beg, end in records: - - # skip if length is not available or zero - if not length: - continue - - # append read Id, alignment length and location - idx = len(rids) - rid_append(query) - - # effective length = length * th - # -int(-x // 1) is equivalent to math.ceil(x) but faster - # this value must be >= 1 + rids[idx] = query + lens[idx] = length locmap[subject].extend(( - (beg << 48) + (-int(-length * th // 1) << 31) + idx, - (end << 48) + idx)) + (beg << 24) + idx, + (end << 24) + (1 << 23) + idx)) + idx += 1 # final flush - yield flush() + yield flush_chunk(idx, locmap, rids, lens, *args) -def ordinal_parser_dummy(fh, parser): - """Alignment parsing functionalities stripped from for `ordinal_mapper`. +def flush_chunk(n, rlocmap, rids, rlens, glocmap, gidmap, th, prefix): + """Match reads in current chunk with genes from all genomes. Parameters ---------- - fh : file handle - Alignment file to parse. - parser : callable - Function to parse alignment lines of certain format. + n : int + Number of reads to flush. + rlocmap : dict of list + Read coordinates per genome. + rids : list of str + Read IDs. + rlens : np.array(-1, dtype=int64) + Read lengths. + glocmap : dict of list + Gene coordinates per genome. + gidmap : dict of list + Gene identifiers. + th : float + Length threshold. + prefix : bool + Prefix gene IDs with nucleotide IDs. Returns ------- list of str - Read Ids in same order as in alignment file. - defaultdict of dict of int - Map of read indices to alignment lengths per nucleotide. - defaultdict of list of (int, bool, bool, str) - Flattened list of read coordinates per nucleotide. + Query queue. + list of set of str + Subject(s) queue. + """ + # master read-to-gene(s) map + res = defaultdict(set) - See Also - -------- - ordinal_mapper - match_read_gene_dummy - .tests.test_ordinal.OrdinalTests.test_ordinal_parser_dummy + # effective length = length * th + rels = np.ceil(rlens[:n] * th).astype(np.uint32) - Notes - ----- - This is a dummy function only for test and demonstration purpose but not - called anywhere in the program. See its unit test for details. - """ - rids = [] - lenmap = defaultdict(dict) - locmap = defaultdict(list) + # iterate over nucleotides + for nucl, rlocs in rlocmap.items(): - for query, records in parser(fh): - for subject, _, length, beg, end in records: - idx = len(rids) - rids.append(query) - lenmap[subject][idx] = length - locmap[subject].extend(( - (beg, True, False, idx), - (end, False, False, idx))) + # it's possible that no gene was annotated on the nucleotide + try: + glocs = glocmap[nucl] + except KeyError: + continue + + # get reference to gene identifiers + gids = gidmap[nucl] + + # append prefix if needed + pfx = nucl + '_' if prefix else '' + + # convert list to array + rlocs = np.array(rlocs, dtype=np.int64) + + # execute ordinal algorithm when reads are many + # 10 (>5 reads) is an empirically determined cutoff + if rlocs.size > 10: + + # merge pre-sorted genes with reads of unknown sorting status + queue = np.concatenate((glocs, rlocs)) + + # sort genes and reads into a mixture + # timsort is efficient for this task + queue.sort(kind='stable') + + # a potentially more efficient method is to use sortednp: + # rlocs.sort(kind='stable') + # queue = sortednp.merge(glocs, rlocs) + + # map reads to genes using the core algorithm + matches = match_read_gene(queue, rels) + + # execute naive algorithm when reads are few + else: + matches = match_read_gene_quart(glocs, rlocs, rels) + + # add read-gene pairs to the master map + for read, gene in matches: + res[rids[read]].add(pfx + gids[gene]) - return rids, lenmap, locmap + # return matching read Ids and gene Ids + return res.keys(), res.values() def load_gene_coords(fh, sort=False): @@ -216,8 +337,8 @@ def load_gene_coords(fh, sort=False): Returns ------- - dict of int - Binarized gene coordinate information per nucleotide. + dict of np.array(-1, dtype=int64) + Binarized gene position information per genome. dict of list of str Gene IDs. bool @@ -226,15 +347,6 @@ def load_gene_coords(fh, sort=False): See Also -------- match_read_gene - - Notes - ----- - This data structure is central to this algorithm. Starting and ending - coordinates of each gene are separated and flattened into a sorted list. - which enables only one round of list traversal for the entire set of genes - plus reads. - - See the docstring of `match_read_gene` for details. """ coords = {} queue_extend = None @@ -247,6 +359,8 @@ def load_gene_coords(fh, sort=False): used = set() used_add = used.add + nucl = None # current nucleotide + for line in fh: # ">" or "#" indicates genome (nucleotide) name @@ -256,29 +370,34 @@ def load_gene_coords(fh, sort=False): # double ">" or "#" indicates genome name, which serves as # a super group of subsequent nucleotide names; to be ignored if line[1] != c0: + + # convert gene queue to np.array + try: + coords[nucl] = encode_genes(coords[nucl]) + except KeyError: + pass + + # re-initiate gene queue nucl = line[1:].strip() coords[nucl] = [] queue_extend = coords[nucl].extend gids = idmap[nucl] = [] gids_append = gids.append + else: - x = line.rstrip().split('\t') - # begin and end positions are based on genome (nucleotide) + # extract Id, start, end try: - beg, end = int(x[1]), int(x[2]) - except (IndexError, ValueError): + gene, beg, end = line.rstrip().split('\t') + except ValueError: raise ValueError( f'Cannot extract coordinates from line: "{line}".') - idx = len(gids) - gene = x[0] + + # add positions to queue + queue_extend((beg, end)) + + # record gene Id gids_append(gene) - if beg > end: - beg, end = end, beg - queue_extend(( - ((beg - 1) << 48) + (3 << 30) + idx, - (end << 48) + (1 << 30) + idx - )) # check duplicate if isdup is None: @@ -287,28 +406,80 @@ def load_gene_coords(fh, sort=False): else: used_add(gene) + # final conversion + try: + coords[nucl] = encode_genes(coords[nucl]) + except KeyError: + raise ValueError('No coordinate was read from file.') + # sort gene coordinates per nucleotide if sort: for queue in coords.values(): - queue.sort() + queue.sort(kind='stable') # timsort return coords, idmap, isdup or False -def match_read_gene(queue): +def encode_genes(lst): + """Encode gene positions into a binary queue. + + Parameters + ---------- + lst : list of int + Flattened list of start and end coordinates. + + Returns + ------- + np.array(-1, dtype=int64) + Encoded gene queue. + """ + try: + arr = np.asarray(lst, dtype=np.int64) + except ValueError: + raise ValueError('Invalid coordinate(s) found.') + + n = arr.size // 2 + idx = np.arange(n) # gene indices + + # separate start (odd) and end (even) positions + beg, end = arr[0::2], arr[1::2] + + # order each pair of start and end coordinates such that smaller one + # comes first + # faster than np.sort since there are only two numbers + # < is slightly faster than np.less + cmp = beg < end + lo = np.where(cmp, beg, end) + hi = np.where(cmp, end, beg) + + # encode coordinate, start/end, is gene, and index into one integer + lo = np.left_shift(lo - 1, 24) + (1 << 22) + idx + hi = np.left_shift(hi, 24) + (3 << 22) + idx + + # fastest way to interleave two arrays + # https://stackoverflow.com/questions/5347065/ + que = np.empty((2 * n,), dtype=np.int64) + que[0::2] = lo + que[1::2] = hi + + return que + + +@njit((int64[:], uint32[:])) +def match_read_gene(queue, rels): """Associate reads with genes based on a sorted queue of coordinates. Parameters ---------- - queue : list of int - Sorted queue of coordinates. + queue : np.array(-1, dtype=int64) + Paired queue of reads. + rels : np.array(-1, dtype=uint32) + Effective lengths of reads. - Yields + Returns ------ - int - Read index. - int - Gene index. + list of tuple of (int, int) + Indices of matched reads and genes. See Also -------- @@ -327,32 +498,21 @@ def match_read_gene(queue): This function is the most compute-intensive step in the entire analysis, therefore it has been deeply optimized to increase performance wherever - possible. Notably, it extensively uses bitwise operations to extract - multiple pieces of information from a single integer. - - Specifically, each coordinate (an integer) has the following information - (from right to left): - - - Bits 1-30: Index of gene / read (30 bits, max.: 1,073,741,823). - - Bits 31: Whether it is a gene (1) or a read (0) (1 bit). - - Bits 32-58: Whether it is the start (positive) or end (0) of a gene / - read. If start, the value represents the effective length of - an alignment if it's a read, or 1 if it's a gene (17 bits, - max.: 131,071). - - Bits 59- : Coordinate (position on the genome, nt) (unlimited) - - The Python code to extract these pieces of information is: - - - Coordinate: `code >> 48` - - Effective length: `code >> 31 & (1 << 17) - 1`, or `code >> 31 & 131071` - - Gene or read: `code & (1 << 30)` - - Gene/read index: `code & (1 << 30) - 1` + possible. Note: Repeated bitwise operations are usually more efficient that a single bitwise operation assigned to a new variable. """ - genes = {} # current genes cache - reads = {} # current reads cache + # matched read-gene pairs + # numba has issues with jitting generator functions + # see https://github.com/numba/numba/issues/6993 + # therefore this function returns rather than yields + matches = [] + matches_append = matches.append + + # caches of current genes and reads + genes = {} if nojit else Dict.empty(key_type=int64, value_type=int64) + reads = {} if nojit else Dict.empty(key_type=int64, value_type=int64) # cache method references genes_items = genes.items @@ -365,80 +525,73 @@ def match_read_gene(queue): for code in queue: # if this is a gene, - if code & (1 << 30): + if code & (1 << 22): - # when a gene begins, - # if code >> 31 & 131071: - if code & (1 << 31): + # when a gene starts, + if not code & (1 << 23): - # add its index and coordinate to cache - genes[code & (1 << 30) - 1] = code >> 48 + # add it to cache + genes[code & (1 << 22) - 1] = code >> 24 # when a gene ends, else: - # find gene start - gloc = genes_pop(code & (1 << 30) - 1) + # find the gene's start + gid = code & (1 << 22) - 1 + gloc = genes_pop(gid) # check cached reads for matches for rid, rloc in reads_items(): # is a match if read/gene overlap is long enough - # code >> 48: read end coordinate - # gloc: gene start coordinate - # rloc >> 17`: read start coordinate - # rloc & 131071`: effective length - if (code >> 48) - max(gloc, rloc >> 17) >= rloc & 131071: - yield rid, code & (1 << 30) - 1 + if (code >> 24) - max(gloc, rloc) >= rels[rid]: + matches_append((rid, gid)) # if this is a read, else: - # when a read begins, - if code >> 31 & 131071: + # when a read starts, + if not code & (1 << 23): - # add its index, coordinate and effective length to cache - # the latter two are stored as a single integer - reads[code & (1 << 30) - 1] = code >> 31 + # add it to cache + reads[code & (1 << 22) - 1] = code >> 24 # when a read ends, else: - # find read start and effective length - rloc = reads_pop(code & (1 << 30) - 1) + # find the read's start + rid = code & (1 << 22) - 1 + rloc = reads_pop(rid) # check cached genes - # a potential optimization is to pre-calculate `rloc >> 17` - # and `(code >> 48) - (rloc & 131071)`, however, it is not - # worth the overhead in real applications because there is - # usually zero or one gene in cache - # another potential optimization is to replace `max` (which is - # a function call with a ternary operator, but one needs the - # first optimization prior to this + # theoretically, an optimization is to loop in reverse order, + # `dropwhile` unmatched, then return all remaining ones for gid, gloc in genes_items(): # same as above - if (code >> 48) - max(gloc, rloc >> 17) >= rloc & 131071: - yield code & (1 << 30) - 1, gid + if (code >> 24) - max(gloc, rloc) >= rels[rid]: + matches_append((rid, gid)) + return matches -def match_read_gene_naive(geneque, readque): +@njit((int64[:], int64[:], uint32[:])) +def match_read_gene_naive(gque, rque, rels): """Associate reads with genes using a naive approach, which performs nested iteration over genes and reads. Parameters ---------- - geneque : list of int + gque : np.array(-1, dtype=int64) Sorted queue of genes. - readque : list of int + rque : np.array(-1, dtype=int64) Paired queue of reads. + rels : np.array(-1, dtype=uint32) + Effective lengths of reads. - Yields + Returns ------ - int - Read index. - int - Gene index. + list of tuple of (int, int) + Indices of matched reads and genes. See Also -------- @@ -452,52 +605,58 @@ def match_read_gene_naive(geneque, readque): method. However, when the number of reads is small, it may be efficient because it saves the sorting step. """ + matches = [] + matches_append = matches.append + # only genes are cached - genes = {} + genes = {} if nojit else Dict.empty(key_type=int64, value_type=int64) genes_pop = genes.pop - # iterate over reads (paired) - it = iter(readque) + # iterate over reads in pairs + # this way is faster than rque.reshape(-1, 2) + it = iter(rque) for x, y in zip(it, it): # pre-calculate read metrics - rid = x & (1 << 30) - 1 # index - beg = x >> 48 # start coordinate - end = y >> 48 # end coordinate - L = x >> 31 & 131071 # effective length + rid = x & (1 << 22) - 1 # index + beg = x >> 24 # start coordinate + end = y >> 24 # end coordinate + L = rels[rid] # effective length - # iterate over genes (ordinal) - for code in geneque: + # iterate over gene positions + for code in gque: - # add gene to cache - if code & (1 << 31): - genes[code & (1 << 30) - 1] = code >> 48 + # at start, add gene to cache + if not code & (1 << 23): + genes[code & (1 << 22) - 1] = code >> 24 - # check overlap while removing gene from cache: - # min(gene end, read end) - max(gene start, read start) >= - # effective length - elif (min(code >> 48, end) - - max(genes_pop(code & (1 << 30) - 1), beg)) >= L: - yield rid, code & (1 << 30) - 1 + # at end, check overlap while removing gene from cache: + # min(gene end, read end) - max(gene start, read start) >= + # effective length + elif (min(code >> 24, end) - + max(genes_pop(code & (1 << 22) - 1), beg)) >= L: + matches_append((rid, code & (1 << 22) - 1)) + return matches -def match_read_gene_quart(geneque, readque): +@njit((int64[:], int64[:], uint32[:])) +def match_read_gene_quart(gque, rque, rels): """Associate reads with genes by iterating between read region and nearer end of gene queue. Parameters ---------- - geneque : list of int + gque : np.array(-1, dtype=int64) Sorted queue of genes. - readque : list of int + rque : np.array(-1, dtype=int64) Paired queue of reads. + rels : np.array(-1, dtype=uint32) + Effective lengths of reads. - Yields + Returns ------ - int - Read index. - int - Gene index. + list of tuple of (int, int) + Indices of matched reads and genes. See Also -------- @@ -518,216 +677,130 @@ def match_read_gene_quart(geneque, readque): This method uses bisection to find the insertion point of read start in the pre-sorted gene queue, which has O(logn) time. """ - n = len(geneque) # entire search space - mid = n // 2 # mid point + matches = [] + matches_append = matches.append + + # genes within read region + within = {} if nojit else Dict.empty(key_type=int64, value_type=int64) + within_pop = within.pop + + # genes outside read region + # there is some numba issue, otherwise set may be more suitable + outside = {} if nojit else Dict.empty(key_type=int64, value_type=boolean) + outside_pop = outside.pop + + n = gque.size # entire search space + mid = n // 2 # mid point # iterate over paired read starts and ends - it = iter(readque) + it = iter(rque) for x, y in zip(it, it): # pre-calculate read metrics - rid = x & (1 << 30) - 1 # index - beg = x >> 48 # start coordinate - end = y >> 48 # end coordinate - L = x >> 31 & 131071 # effective length + rid = x & (1 << 22) - 1 # index + beg = x >> 24 # start coordinate + end = y >> 24 # end coordinate + L = rels[rid] # effective length - # genes starting within read region - # will record their coordinates - within = {} - within_pop = within.pop + # locate read start using bisection + i = np.searchsorted(gque, x, side='right') # read starts in left half of gene queue - if x < geneque[mid]: - - # genes starting before read region - # no need to record coordinates as overlap is not possible - before = set() - before_add = before.add - before_remove = before.remove - - # locate read start using bisection - # Python's `bisect` is more efficient than homebrew code - i = bisect(geneque, x, hi=mid) + if i < mid: # iterate over gene positions before read region - for code in geneque[:i]: + for code in gque[:i]: # add gene to cache when it starts - if code & (1 << 31): - before_add(code & (1 << 30) - 1) + if not code & (1 << 23): + outside[code & (1 << 22) - 1] = True # drop gene from cache when it ends else: - before_remove(code & (1 << 30) - 1) + outside_pop(code & (1 << 22) - 1) # iterate over gene positions after read start - for code in geneque[i:]: + for code in gque[i:]: # stop when exceeding read end if code > y: break # when gene starts, add it and its coordinate to cache - elif code & (1 << 31): - within[code & (1 << 30) - 1] = code >> 48 + elif not code & (1 << 23): + within[code & (1 << 22) - 1] = code >> 24 # when gene ends, check overlap and remove it from cache - else: - gid = code & (1 << 30) - 1 - - # most genes should start before read region - try: - before_remove(gid) - - # if gene started within read region, - # overlap = gene end - gene start - except KeyError: - if (code >> 48) - within_pop(gid) >= L: - yield rid, gid - - # if gene started before read region, - # overlap = gene end - read start - else: - if (code >> 48) - beg >= L: - yield rid, gid - - # yield all genes that started before read but not yet ended - for gid in before: - yield rid, gid - - # yield genes that started after read but not yet ended, + # if gene started before read region (most cases), + # overlap = gene end - read start + elif outside_pop(code & (1 << 22) - 1, False): + if (code >> 24) - beg >= L: + matches_append((rid, code & (1 << 22) - 1)) + + # if gene started within read region, + # overlap = gene end - gene start + elif (code >> 24) - within_pop(code & (1 << 22) - 1) >= L: + matches_append((rid, code & (1 << 22) - 1)) + + # return all genes that started before read but not yet ended + for gid in outside: + matches_append((rid, gid)) + outside.clear() + + # return genes that started after read but not yet ended, # until overlap = read end - gene start is too short for gid, gloc in within.items(): if end - gloc < L: break - yield rid, gid + matches_append((rid, gid)) + within.clear() # read starts in right half of gene queue else: - # genes starting after read region - # no need to record coordinates as overlap is not possible - after = set() - after_add = after.add - after_remove = after.remove - - # locate read start using bisection - i = bisect(geneque, x, lo=mid + 1, hi=n) - # iterate over gene positions within read region - # for i in range(bisect(geneque, x), n): while i < n: - code = geneque[i] + code = gque[i] # stop when exceeding read end if code > y: break + gid = code & (1 << 22) - 1 # when gene starts, add it and its coordinate to cache - if code & (1 << 31): - within[code & (1 << 30) - 1] = code >> 48 + if not code & (1 << 23): + within[gid] = code >> 24 # when gene ends, check overlap and remove it from cache # gene end must be <= read end # if gene not found in cache (gene started before read region), # use read start, otherwise use gene start - elif (code >> 48) - within_pop(code & (1 << 30) - 1, beg) >= L: - yield rid, code & (1 << 30) - 1 + elif (code >> 24) - within_pop(gid, beg) >= L: + matches_append((rid, gid)) # move to next position i += 1 # iterate over gene positions after read region - for code in geneque[i:]: + for code in gque[i:]: + gid = code & (1 << 22) - 1 # add gene to cache when it starts - if code & (1 << 31): - after_add(code & (1 << 30) - 1) + if not code & (1 << 23): + outside[gid] = True # when gene ends, - else: - - # most remaining genes should start after read region - try: - after_remove(code & (1 << 30) - 1) + # most remaining genes should start after read region + # otherwise, + elif not outside_pop(gid, False): - # otherwise, the gene spans over entire read region - except KeyError: - - # check overlap while removing gene from cache - # gene end must be >= read end - # if gene not found in cache, use read start, otherwise - # use gene start - if end - within_pop(code & (1 << 30) - 1, beg) >= L: - yield rid, code & (1 << 30) - 1 - - -def match_read_gene_dummy(queue, lens, th): - """Associate reads with genes based on a sorted queue of coordinates. - Parameters - ---------- - queue : list of tuple of (int, bool, bool, int) - Sorted queue of coordinates (location, start or end, gene or read, - index). - lens : dict - Read-to-alignment length map. - th : float - Threshold for read/gene overlapping fraction. - - Yields - ------ - int - Read index. - int - Gene index. - - See Also - -------- - match_read_gene - .tests.test_ordinal.OrdinalTests.test_match_read_gene_dummy - - Notes - ----- - This is a dummy function which is only for test and demonstration purpose, - but is not called anywhere in the program. - - The formal function `match_read_gene` extensively uses bitwise operations - and thus is hard to read. Therefore the current function, which represents - the original form of prior to optimization, is retained. - """ - genes = {} # current genes - reads = {} # current reads - - # walk through flattened queue of reads and genes - for loc, is_start, is_gene, idx in queue: - if is_gene: - - # when a gene starts, added to gene cache - if is_start: - genes[idx] = loc - - # when a gene ends, - else: - - # find gene start and remove it from cache - gloc = genes.pop(idx) - - # check cached reads for matches - for rid, rloc in reads.items(): - - # is a match if read/gene overlap is long enough - if loc - max(gloc, rloc) >= lens[rid] * th: - yield rid, idx - - # the same for reads - else: - if is_start: - reads[idx] = loc - else: - rloc = reads.pop(idx) - for gid, gloc in genes.items(): - if loc - max(rloc, gloc) >= lens[idx] * th: - yield idx, gid + # check overlap while removing gene from cache + # gene end must be >= read end + # if gene not found in cache, use read start, otherwise + # use gene start + if end - within_pop(gid, beg) >= L: + matches_append((rid, gid)) + return matches def calc_gene_lens(mapper): @@ -750,13 +823,13 @@ def calc_gene_lens(mapper): idmap_ = idmap[nucl] nucl += '_' for code in queue: - gid = idmap_[code & (1 << 30) - 1] + gid = idmap_[code & (1 << 22) - 1] if prefix: gid = nucl + gid - if code >> 31 & 131071: - res[gid] = -(code >> 48) - else: - res[gid] += code >> 48 + if not code & (1 << 23): # start + res[gid] = -(code >> 24) + else: # end + res[gid] += code >> 24 return res @@ -785,6 +858,7 @@ def load_gene_lens(fh): >>> mapper = partial(ordinal_mapper, coords=coords, idmap=idmap, prefix=prefix) >>> sizemap = calc_gene_lens(mapper) + """ lens, lens_ = {}, None used, isdup = set(), None diff --git a/woltka/range.py b/woltka/range.py index 7538ca0..312b5d4 100644 --- a/woltka/range.py +++ b/woltka/range.py @@ -99,12 +99,13 @@ def merge_ranges(ranges): list of int Merged ranges. - Notes - ----- + Examples + -------- Ranges that have overlaps will be merged into one. For example: >>> merge_ranges([1, 3, 2, 4, 6, 8, 7, 9]) [1, 4, 6, 9] + """ res = [] res_extend = res.extend diff --git a/woltka/table.py b/woltka/table.py index 59d487e..0e6ecf1 100644 --- a/woltka/table.py +++ b/woltka/table.py @@ -59,37 +59,41 @@ def prep_table(profile, samples=None, tree=None, rankdic=None, namedic=None, list of dict Metadata (extra columns, or BIOM observation metadata). + See Also + -------- + write_tsv + .biom.table_to_biom + .workflow.write_profiles + + Notes + ----- + Optionally, three metadata columns, "Name", "Rank" and "Lineage" will be + appended to the table. + + A feature will be dropped if all its values are zero. However, samples will + not be dropped even when empty. + + A stratified feature will be printed as "stratum|feature". + Examples -------- Convert output to a BIOM table: + >>> import biom >>> args = profile, samples, tree, rankdic, namedic, name_as_id >>> table = biom.Table(prep_table(*args)) Convert output to a Pandas DataFrame (data only): + >>> import pandas as pd >>> data, features, samples, metadata = prep_table(*args) >>> df = pd.DataFrame(data, features, samples) Convert output to a Pandas DataFrame (data and metadata): + >>> df = pd.concat([pd.DataFrame(data, features, samples), pd.DataFrame.from_records(metadata, features)], axis=1) - See Also - -------- - write_tsv - .biom.table_to_biom - .workflow.write_profiles - - Notes - ----- - Optionally, three metadata columns, "Name", "Rank" and "Lineage" will be - appended to the table. - - A feature will be dropped if all its values are zero. However, samples will - not be dropped even when empty. - - A stratified feature will be printed as "stratum|feature". """ # determine range and order of samples samples = [x for x in samples if x in profile] if samples else sorted( diff --git a/woltka/tests/test_align.py b/woltka/tests/test_align.py index e83815a..6b94e70 100644 --- a/woltka/tests/test_align.py +++ b/woltka/tests/test_align.py @@ -18,7 +18,7 @@ from woltka.align import ( plain_mapper, iter_align, infer_align_format, assign_parser, parse_sam_file, parse_sam_file_ex, parse_sam_file_ft, parse_sam_file_ex_ft, - cigar_to_lens, cigar_to_lens_ord, parse_sam_file_pd, + cigar_to_lens, cigar_to_lens_ord, parse_map_file, parse_map_file_ft, parse_b6o_file, parse_b6o_file_ex, parse_b6o_file_ft, parse_b6o_file_ex_ft, parse_paf_file, parse_paf_file_ex, parse_paf_file_ft, parse_paf_file_ex_ft) @@ -354,9 +354,6 @@ def test_cigar_to_lens_ord(self): self.assertTupleEqual(cigar_to_lens_ord('150M'), (150, 150)) self.assertTupleEqual(cigar_to_lens_ord('3M1I3M1D5M'), (11, 12)) - def test_parse_sam_file_pd(self): - self.assertIsNone(parse_sam_file_pd([])) - def test_parse_map_file(self): aln = ( 'R1 G1', # 1st line (normal) diff --git a/woltka/tests/test_ordinal.py b/woltka/tests/test_ordinal.py index 41d39bf..7f8377a 100644 --- a/woltka/tests/test_ordinal.py +++ b/woltka/tests/test_ordinal.py @@ -19,12 +19,15 @@ from io import StringIO from functools import partial +import numpy as np +import numpy.testing as npt + from woltka.file import openzip from woltka.align import parse_b6o_file_ex, parse_sam_file_ex from woltka.ordinal import ( match_read_gene_dummy, match_read_gene, match_read_gene_naive, - match_read_gene_quart, ordinal_parser_dummy, ordinal_mapper, - load_gene_coords, calc_gene_lens, load_gene_lens) + match_read_gene_quart, ordinal_mapper, flush_chunk, + load_gene_coords, encode_genes, calc_gene_lens, load_gene_lens) class OrdinalTests(TestCase): @@ -85,11 +88,11 @@ def test_match_read_gene_dummy(self): # flatten lists genes = [x for id_, beg, end in genes for x in ( - (beg, True, True, id_), - (end, False, True, id_))] + (beg, False, True, id_), + (end, True, True, id_))] reads = [x for id_, beg, end in reads for x in ( - (beg, True, False, id_), - (end, False, False, id_))] + (beg, False, False, id_), + (end, True, False, id_))] # merge genes and reads and sort queue = sorted(genes + reads) @@ -118,157 +121,134 @@ def test_match_read_gene(self): """This test is for the real function. """ # gene table - genes = [(1, 4, 29), - (2, 32, 61), - (3, 64, 94)] + genes = [(5, 29), # 0 + (33, 61), # 1 + (65, 94)] # 2 # read map - reads = [(1, 9, 29), - (2, 15, 35), - (3, 19, 39), - (4, 21, 41), - (5, 29, 49), - (6, 29, 49), # identical - (7, 59, 79), - (8, 64, 84), - (9, 81, 95)] # shorter + reads = [(10, 29), # 0 + (16, 35), # 1 + (20, 39), # 2 + (22, 41), # 3 + (30, 49), # 4 + (30, 49), # 5 (identical to 4) + (60, 79), # 6 + (65, 84), # 7 + (82, 95)] # 8 (shorter than others) # read length is uniformly 20, threshold is 80%, so effective # alignment length is 20 * 0.8 = 16 - genes = [x for idx, beg, end in genes for x in ( - (beg << 48) + (1 << 31) + (1 << 30) + idx, - (end << 48) + (0 << 31) + (1 << 30) + idx)] - reads = [x for idx, beg, end in reads for x in ( - (beg << 48) + (16 << 31) + (0 << 30) + idx, - (end << 48) + (0 << 31) + (0 << 30) + idx)] - queue = sorted(genes + reads) + rels = np.full((len(reads),), 16, dtype=np.uint32) + + # flatten genes and reads + genes = np.array([x for i, (beg, end) in enumerate(genes) for x in ( + (beg << 24) + (1 << 22) + i, + (end << 24) + (3 << 22) + i)]) + reads = np.array([x for i, (beg, end) in enumerate(reads) for x in ( + (beg << 24) + i, + (end << 24) + (1 << 23) + i)]) + + # merge and sort genes and reads + queue = np.concatenate((genes, reads)) + queue.sort() # default - obs = list(match_read_gene(queue)) - exp = [(1, 1), - (5, 2), - (6, 2), - (8, 3)] + obs = list(match_read_gene(queue, rels)) + exp = [(0, 0), + (4, 1), + (5, 1), + (7, 2)] self.assertListEqual(obs, exp) def test_match_read_gene_naive(self): """The naive solution should produce identical result compared to the default (ordinal) solution. """ - genes = [(1, 4, 29), - (2, 32, 61), - (3, 63, 94)] - reads = [(1, 9, 29), - (2, 15, 35), - (3, 19, 39), - (4, 21, 41), - (5, 29, 49), - (6, 29, 49), - (7, 59, 79), - (8, 64, 84), - (9, 81, 95)] - genes = [x for idx, beg, end in genes for x in ( - (beg << 48) + (1 << 31) + (1 << 30) + idx, - (end << 48) + (0 << 31) + (1 << 30) + idx)] - reads = [x for idx, beg, end in reads for x in ( - (beg << 48) + (14 << 31) + (0 << 30) + idx, - (end << 48) + (0 << 31) + (0 << 30) + idx)] + genes = [(4, 29), + (32, 61), + (64, 94)] + reads = [(9, 29), + (15, 35), + (19, 39), + (21, 41), + (29, 49), + (29, 49), + (59, 79), + (64, 84), + (81, 95)] + # shorten effective length + rels = np.full((len(reads),), 14, dtype=np.uint32) + genes = np.array([x for i, (beg, end) in enumerate(genes) for x in ( + (beg << 24) + (1 << 22) + i, + (end << 24) + (3 << 22) + i)]) + reads = np.array([x for i, (beg, end) in enumerate(reads) for x in ( + (beg << 24) + i, + (end << 24) + (1 << 23) + i)]) # don't sort, but directly feed both queues - obs = list(match_read_gene_naive(genes, reads)) - exp = [(1, 1), - (2, 1), - (5, 2), + obs = list(match_read_gene_naive(genes, reads, rels)) + exp = [(0, 0), + (1, 0), + (4, 1), + (5, 1), (6, 2), - (7, 3), - (8, 3)] + (7, 2)] self.assertListEqual(obs, exp) def test_match_read_gene_quart(self): """The naive solution should produce identical result compared to the default (ordinal) solution. """ - genes = [(1, 4, 29), - (2, 32, 61), - (3, 64, 94), - (4, 60, 76), # added a small gene within a read - (5, 66, 72)] # added a tiny gene - reads = [(1, 9, 29), - (2, 15, 35), - (3, 19, 39), - (4, 21, 41), - (5, 29, 49), - (6, 29, 49), - (7, 59, 79), - (8, 64, 84), - (9, 81, 95)] - genes = [x for idx, beg, end in genes for x in ( - (beg << 48) + (1 << 31) + (1 << 30) + idx, - (end << 48) + (0 << 31) + (1 << 30) + idx)] + genes = [(4, 29), + (32, 61), + (64, 94), + (60, 76), # added a small gene within a read + (66, 72)] # added a tiny gene + reads = [(9, 29), + (15, 35), + (19, 39), + (21, 41), + (29, 49), + (29, 49), + (59, 79), + (64, 84), + (69, 75), # added a small read starting in right half + (81, 95)] + rels = np.full((len(reads),), 14, dtype=np.uint32) + genes = np.array([x for i, (beg, end) in enumerate(genes) for x in ( + (beg << 24) + (1 << 22) + i, + (end << 24) + (3 << 22) + i)]) genes.sort() - reads = [x for idx, beg, end in reads for x in ( - (beg << 48) + (14 << 31) + (0 << 30) + idx, - (end << 48) + (0 << 31) + (0 << 30) + idx)] - - obs = list(match_read_gene_quart(genes, reads)) - exp = [(1, 1), - (2, 1), - (5, 2), + reads = np.array([x for i, (beg, end) in enumerate(reads) for x in ( + (beg << 24) + i, + (end << 24) + (1 << 23) + i)]) + + obs = list(match_read_gene_quart(genes, reads, rels)) + exp = [(0, 0), + (1, 0), + (4, 1), + (5, 1), + (6, 3), # match to added small gene (6, 2), - (7, 4), # match to added small gene - (7, 3), - (8, 3)] + (7, 2)] self.assertListEqual(obs, exp) # a special case with a giant read - genes = [(1, 0, 5), - (2, 5, 7), - (3, 6, 8)] - reads = [(1, 3, 9)] - genes = [x for idx, beg, end in genes for x in ( - (beg << 48) + (1 << 31) + (1 << 30) + idx, - (end << 48) + (0 << 31) + (1 << 30) + idx)] + genes = [(0, 5), + (5, 7), + (6, 8)] + reads = [(3, 9)] + rels = np.array([5], dtype=np.uint32) + genes = np.array([x for i, (beg, end) in enumerate(genes) for x in ( + (beg << 24) + (1 << 22) + i, + (end << 24) + (3 << 22) + i)]) genes.sort() - reads = [x for idx, beg, end in reads for x in ( - (beg << 48) + (5 << 31) + (0 << 30) + idx, - (end << 48) + (0 << 31) + (0 << 30) + idx)] - obs = list(match_read_gene_quart(genes, reads)) + reads = np.array([x for i, (beg, end) in enumerate(reads) for x in ( + (beg << 24) + i, + (end << 24) + (1 << 23) + i)]) + obs = list(match_read_gene_quart(genes, reads, rels)) exp = [] self.assertListEqual(obs, exp) - def test_ordinal_parser_dummy(self): - # b6o (BLAST, DIAMOND, BURST, etc.) - b6o = iter(( - 'S1/1 NC_123456 100 100 0 0 1 100 225 324 1.2e-30 345', - 'S1/2 NC_123456 95 98 2 1 2 99 708 608 3.4e-20 270')) - obs = ordinal_parser_dummy(b6o, parse_b6o_file_ex) - self.assertListEqual(obs[0], ['S1/1', 'S1/2']) - self.assertDictEqual(obs[1], {'NC_123456': {0: 100, 1: 98}}) - self.assertDictEqual(obs[2], {'NC_123456': [ - (224, True, False, 0), (324, False, False, 0), - (607, True, False, 1), (708, False, False, 1)]}) - - # sam (BWA, Bowtie2, Minimap2 etc.) - sam = iter(( - # SAM header to be ignored - '@HD VN:1.0 SO:unsorted', - # normal, fully-aligned, forward strand - 'S1 77 NC_123456 26 0 100M * 0 0 * *', - # shortened, reverse strand - 'S1 141 NC_123456 151 0 80M * 0 0 * *', - # not perfectly aligned, unpaired - 'S2 0 NC_789012 186 0 50M5I20M5D20M * 0 0 * *', - # unaligned - 'S2 16 * 0 0 * * 0 0 * *')) - obs = ordinal_parser_dummy(sam, parse_sam_file_ex) - self.assertListEqual(obs[0], ['S1/1', 'S1/2', 'S2']) - self.assertDictEqual(obs[1], { - 'NC_123456': {0: 100, 1: 80}, - 'NC_789012': {2: 90}}) - self.assertDictEqual(obs[2], { - 'NC_123456': [(25, True, False, 0), (125, False, False, 0), - (150, True, False, 1), (230, False, False, 1)], - 'NC_789012': [(185, True, False, 2), (280, False, False, 2)]}) - def test_ordinal_mapper(self): # uses the same example as above, with some noises coords, idmap, _ = load_gene_coords(( @@ -332,6 +312,44 @@ def test_ordinal_mapper(self): self.assertListEqual(list(obs[0]), [x[0] for x in exp]) self.assertListEqual(list(obs[1]), [{x[1]} for x in exp]) + def test_flush_chunk(self): + coords, idmap, _ = load_gene_coords(( + '>n1', + 'g1 5 29', + 'g2 33 61', + 'g3 65 94', + 'gx 108 135')) + aln = StringIO('\n'.join(( + 'r1 n1 95 20 0 0 1 20 10 29 1 1', + 'r2 n1 95 20 0 0 1 20 16 35 1 1', + 'r3 n1 95 20 0 0 1 20 20 39 1 1', + 'r4 n1 95 20 0 0 20 1 22 41 1 1', + 'r5 n1 95 20 0 0 20 1 30 49 1 1', + 'rx nx 95 20 0 0 1 20 1 20 1 1', + 'r6 n1 95 20 0 0 1 20 49 30 1 1', + 'r7 n1 95 20 0 0 25 6 79 60 1 1', + 'r8 n1 95 20 0 0 1 20 84 65 1 1', + 'r9 n1 95 20 0 0 1 20 95 82 1 1', + 'rx nx 95 0 0 0 0 0 0 0 1 1', + '# end of file'))) + idx, rids, rlens, locmap = 0, [], [], {} + for query, records in parse_b6o_file_ex(aln): + for subject, _, length, beg, end in records: + rids.append(query) + rlens.append(length) + locmap.setdefault(subject, []).extend(( + (beg << 24) + idx, (end << 24) + (1 << 23) + idx)) + idx += 1 + rlens = np.array(rlens) + obs = flush_chunk( + len(rids), locmap, rids, rlens, coords, idmap, 0.8, False) + exp = [('r1', 'g1'), + ('r5', 'g2'), + ('r6', 'g2'), + ('r8', 'g3')] + self.assertListEqual(list(obs[0]), [x[0] for x in exp]) + self.assertListEqual(list(obs[1]), [{x[1]} for x in exp]) + def test_load_gene_coords(self): # simple case tbl = ('>n1', @@ -342,13 +360,14 @@ def test_load_gene_coords(self): self.assertFalse(isdup) self.assertDictEqual(idmap, {'n1': ['g1', 'g2', 'g3']}) exp = {'n1': [ - (4 << 48) + (3 << 30) + 0, - (29 << 48) + (1 << 30) + 0, - (32 << 48) + (3 << 30) + 1, - (61 << 48) + (1 << 30) + 1, - (64 << 48) + (3 << 30) + 2, - (94 << 48) + (1 << 30) + 2]} - self.assertDictEqual(obs, exp) + (4 << 24) + (1 << 22) + 0, + (29 << 24) + (3 << 22) + 0, + (32 << 24) + (1 << 22) + 1, + (61 << 24) + (3 << 22) + 1, + (64 << 24) + (1 << 22) + 2, + (94 << 24) + (3 << 22) + 2]} + self.assertListEqual(list(obs.keys()), list(exp.keys())) + npt.assert_array_equal(obs['n1'], exp['n1']) # NCBI accession tbl = ('## GCF_000123456\n', @@ -363,24 +382,31 @@ def test_load_gene_coords(self): self.assertDictEqual(idmap, { 'NC_123456': ['1', '2'], 'NC_789012': ['1', '2']}) exp = {'NC_123456': [ - (4 << 48) + (3 << 30) + 0, - (384 << 48) + (1 << 30) + 0, - (409 << 48) + (3 << 30) + 1, - (933 << 48) + (1 << 30) + 1], + (4 << 24) + (1 << 22) + 0, + (384 << 24) + (3 << 22) + 0, + (409 << 24) + (1 << 22) + 1, + (933 << 24) + (3 << 22) + 1], 'NC_789012': [ - (74 << 48) + (3 << 30) + 1, - (529 << 48) + (1 << 30) + 1, - (637 << 48) + (3 << 30) + 0, - (912 << 48) + (1 << 30) + 0]} - self.assertDictEqual(obs, exp) + (74 << 24) + (1 << 22) + 1, + (529 << 24) + (3 << 22) + 1, + (637 << 24) + (1 << 22) + 0, + (912 << 24) + (3 << 22) + 0]} + self.assertListEqual(list(obs.keys()), list(exp.keys())) + for key in obs.keys(): + npt.assert_array_equal(obs[key], exp[key]) # don't sort obs = load_gene_coords(tbl, sort=False)[0]['NC_789012'] - exp = [(637 << 48) + (3 << 30) + 0, (912 << 48) + (1 << 30) + 0, - (74 << 48) + (3 << 30) + 1, (529 << 48) + (1 << 30) + 1] - self.assertListEqual(obs, exp) + exp = [(637 << 24) + (1 << 22) + 0, (912 << 24) + (3 << 22) + 0, + (74 << 24) + (1 << 22) + 1, (529 << 24) + (3 << 22) + 1] + npt.assert_array_equal(obs, exp) # incorrect formats + # empty file + msg = 'No coordinate was read from file.' + with self.assertRaises(ValueError) as ctx: + load_gene_coords(()) + self.assertEqual(str(ctx.exception), msg) # only one column msg = 'Cannot extract coordinates from line:' with self.assertRaises(ValueError) as ctx: @@ -390,10 +416,6 @@ def test_load_gene_coords(self): with self.assertRaises(ValueError) as ctx: load_gene_coords(('hello\t100',)) self.assertEqual(str(ctx.exception), f'{msg} "hello\t100".') - # three columns but 3rd is string - with self.assertRaises(ValueError) as ctx: - load_gene_coords(('hello\t100\tthere',)) - self.assertEqual(str(ctx.exception), f'{msg} "hello\t100\tthere".') # real coords file fp = join(self.datdir, 'function', 'coords.txt.xz') @@ -404,20 +426,35 @@ def test_load_gene_coords(self): self.assertEqual(len(obs), 107) obs_ = obs['G000006745'] self.assertEqual(len(obs_), 7188) - self.assertEqual(obs_[0], (371 << 48) + (3 << 30) + 0) - self.assertEqual(obs_[1], (806 << 48) + (1 << 30) + 0) - self.assertEqual(obs_[2], (815 << 48) + (3 << 30) + 1) - self.assertEqual(obs_[3], (2177 << 48) + (1 << 30) + 1) + self.assertEqual(obs_[0], (371 << 24) + (1 << 22) + 0) + self.assertEqual(obs_[1], (806 << 24) + (3 << 22) + 0) + self.assertEqual(obs_[2], (815 << 24) + (1 << 22) + 1) + self.assertEqual(obs_[3], (2177 << 24) + (3 << 22) + 1) + + def test_encode_genes(self): + lst = ['5', '384', '410', '933', '912', '638', '529', '75'] + obs = encode_genes(lst) + exp = np.array([ + (4 << 24) + (1 << 22) + 0, (384 << 24) + (3 << 22) + 0, + (409 << 24) + (1 << 22) + 1, (933 << 24) + (3 << 22) + 1, + (637 << 24) + (1 << 22) + 2, (912 << 24) + (3 << 22) + 2, + (74 << 24) + (1 << 22) + 3, (529 << 24) + (3 << 22) + 3]) + npt.assert_array_equal(obs, exp) + + # coordinate not a number + with self.assertRaises(ValueError) as ctx: + encode_genes(['hello', 'there']) + self.assertEqual(str(ctx.exception), 'Invalid coordinate(s) found.') def test_calc_gene_lens(self): - coords = {'NC_123456': [(4 << 48) + (3 << 30) + 0, - (384 << 48) + (1 << 30) + 0, - (409 << 48) + (3 << 30) + 1, - (933 << 48) + (1 << 30) + 1], - 'NC_789012': [(74 << 48) + (3 << 30) + 1, - (529 << 48) + (1 << 30) + 1, - (637 << 48) + (3 << 30) + 0, - (912 << 48) + (1 << 30) + 0]} + coords = {'NC_123456': [(4 << 24) + (1 << 22) + 0, + (384 << 24) + (3 << 22) + 0, + (409 << 24) + (1 << 22) + 1, + (933 << 24) + (3 << 22) + 1], + 'NC_789012': [(74 << 24) + (1 << 22) + 1, + (529 << 24) + (3 << 22) + 1, + (637 << 24) + (1 << 22) + 0, + (912 << 24) + (3 << 22) + 0]} idmap = {'NC_123456': ['1', '2'], 'NC_789012': ['1', '2']} mapper = partial(ordinal_mapper, coords=coords, idmap=idmap, diff --git a/woltka/tests/test_workflow.py b/woltka/tests/test_workflow.py index 2727ce5..9205245 100644 --- a/woltka/tests/test_workflow.py +++ b/woltka/tests/test_workflow.py @@ -294,7 +294,7 @@ def test_build_mapper(self): # plain obs = build_mapper() self.assertEqual(obs[0].__name__, 'plain_mapper') - self.assertEqual(obs[1], 1000) + self.assertEqual(obs[1], 1024) # ordinal fp = join(self.tmpdir, 'coords.txt') @@ -302,7 +302,7 @@ def test_build_mapper(self): f.write('>G1\n1\t10\t20\n2\t35\t50\n') obs = build_mapper(fp) self.assertEqual(obs[0].func.__name__, 'ordinal_mapper') - self.assertEqual(obs[1], 1000000) + self.assertEqual(obs[1], 1048576) # ordinal with overlap threshold and chunk size obs = build_mapper(fp, overlap=75, chunk=50000) diff --git a/woltka/workflow.py b/woltka/workflow.py index 63be500..70812fe 100644 --- a/woltka/workflow.py +++ b/woltka/workflow.py @@ -568,8 +568,8 @@ def build_mapper(coords_fp: str = None, presence of a gene coordinates file (`coords_fp`) is an indicator for using the latter. - The chunk size, if not specified, is determined empirically: 1,000 for - plain mapper and 1,000,000 for ordinal mapper. + The chunk size, if not specified, is determined empirically: 1,024 for + plain mapper and 2 ** 20 = 1,048,576 for ordinal mapper. """ if coords_fp: click.echo('Reading gene coordinates...', nl=False) @@ -577,11 +577,11 @@ def build_mapper(coords_fp: str = None, coords, idmap, prefix = load_gene_coords(fh, sort=True) click.echo(' Done.') click.echo(f' Total number of host sequences: {len(coords)}.') - chunk = chunk or 1000000 + chunk = chunk or 2**20 return partial(ordinal_mapper, coords=coords, idmap=idmap, prefix=prefix, th=overlap and overlap / 100), chunk else: - chunk = chunk or 1000 + chunk = chunk or 1024 return range_mapper if outcov_dir else plain_mapper, chunk