|
1 | 1 | #!/usr/bin/env python3
|
2 | 2 |
|
3 |
| -import argparse |
| 3 | +"""build_baited_bloom_filter: script to build a Bloom filter from a set of |
| 4 | +(genomic) reads that are related to a FASTA file (a transcriptome)""" |
| 5 | + |
4 | 6 | import logging
|
5 |
| -import sys |
6 |
| -from os.path import isfile, exists, dirname, abspath |
7 |
| -from os import \ |
8 |
| - makedirs, \ |
9 |
| - remove |
| 7 | +from os.path import exists, dirname, abspath |
| 8 | +from os import makedirs |
10 | 9 | from shutil import which
|
11 | 10 |
|
12 |
| -from exfi import __version__ |
13 |
| -from exfi.build_baited_bloom_filter import build_baited_bloom_filter |
14 |
| - |
15 |
| -parser = argparse.ArgumentParser( |
16 |
| - usage='build_baited_bloom_filter ' |
17 |
| - '-i transcriptome.fa ' |
18 |
| - '-o bloom_filter.bf ' |
19 |
| - '-k 30 ' |
20 |
| - 'reads1.fq ... readsn.fq', |
21 |
| - description='Build a Bloom filter with reads that have at least one kmer ' |
22 |
| - 'in the transcriptome.', |
23 |
| - epilog='Jorge Langa. Send issues and pull requests to github.com/jlanga/' |
24 |
| - 'exfi' |
25 |
| -) |
26 |
| - |
27 |
| -parser.add_argument( |
28 |
| - '--version', |
29 |
| - action='version', |
30 |
| - version='%(prog)s {version}'.format( |
31 |
| - version=__version__ |
32 |
| - ) |
33 |
| -) |
34 |
| - |
35 |
| -parser.add_argument( |
36 |
| - '--input-fasta', '-f', |
37 |
| - type=str, |
38 |
| - required=True, |
39 |
| - help='Input transcriptome in FASTA format', |
40 |
| - dest='fasta', |
41 |
| - metavar='FILE' |
42 |
| -) |
43 |
| - |
44 |
| -parser.add_argument( |
45 |
| - '--kmer-size', '-k', |
46 |
| - type=int, |
47 |
| - required=False, |
48 |
| - help='The size of the k-mer [31]', |
49 |
| - dest='kmer', |
50 |
| - metavar='INT', |
51 |
| - default=31 |
52 |
| -) |
53 |
| - |
54 |
| -parser.add_argument( |
55 |
| - '--bloom-size', '-b', |
56 |
| - type=str, |
57 |
| - required=False, |
58 |
| - help="Size of the Bloom filter [500M]. This is the total size. The final Bloom filter will be" |
59 |
| - "size / levels.", |
60 |
| - dest="bloom_size", |
61 |
| - metavar='STR', |
62 |
| - default='500M' |
63 |
| -) |
64 |
| - |
65 |
| -parser.add_argument( |
66 |
| - '--levels', '-l', |
67 |
| - type=int, |
68 |
| - required=False, |
69 |
| - help='Build a cascading bloom filter with N levels and ' |
70 |
| - 'output the last level [1]', |
71 |
| - dest='levels', |
72 |
| - metavar='INT', |
73 |
| - default=1 |
74 |
| -) |
| 11 | +from exfi.arguments import build_baited_bloom_filter_args |
| 12 | +from exfi.logger import set_up_logger |
75 | 13 |
|
76 |
| -parser.add_argument( |
77 |
| - '--threads', '-t', |
78 |
| - type=int, |
79 |
| - required=False, |
80 |
| - help='Number of threads to build Bloom filters and bait reads', |
81 |
| - dest='threads', |
82 |
| - metavar='INT', |
83 |
| - default=1 |
84 |
| -) |
85 |
| - |
86 |
| -parser.add_argument( |
87 |
| - '--output-bloom', '-o', |
88 |
| - type=str, |
89 |
| - required=True, |
90 |
| - help='Path to write the resulting Bloom filter', |
91 |
| - dest="bloom", |
92 |
| - metavar="FILE" |
93 |
| -) |
94 |
| - |
95 |
| -parser.add_argument( |
96 |
| - metavar='reads', |
97 |
| - type=str, |
98 |
| - nargs='+', |
99 |
| - help='FASTA/Q files (gz or not)', |
100 |
| - dest='reads' |
101 |
| -) |
102 |
| - |
103 |
| -parser.add_argument( |
104 |
| - "-v", "--verbose", |
105 |
| - action="store_true", |
106 |
| - dest="verbose", |
107 |
| - help="Increase output verbosity" |
108 |
| -) |
109 |
| - |
110 |
| -parser.add_argument( |
111 |
| - "-d", "--debug", |
112 |
| - action="store_true", |
113 |
| - dest="debug", |
114 |
| - help="Log everything!" |
115 |
| -) |
| 14 | +from exfi.build_baited_bloom_filter import build_baited_bloom_filter |
116 | 15 |
|
117 | 16 | if __name__ == '__main__':
|
118 | 17 |
|
119 |
| - # Store arguments |
120 |
| - args = vars(parser.parse_args()) |
121 |
| - args["fasta"] = abspath(args["fasta"]) |
122 |
| - args["bloom"] = abspath(args["bloom"]) |
123 |
| - args["threads"] = int(args["threads"]) # I don't know why it is parsed as tuple |
| 18 | + PARSER = build_baited_bloom_filter_args() |
124 | 19 |
|
125 |
| - # Set up logger |
126 |
| - logger = logging.getLogger() |
127 |
| - logging.basicConfig( |
128 |
| - format='%(asctime)s\t%(module)s\t%(message)s', |
129 |
| - level=logging.ERROR |
130 |
| - ) |
131 |
| - if args["verbose"]: |
132 |
| - logger.setLevel(logging.INFO) |
133 |
| - if args["debug"]: |
134 |
| - logger.setLevel(logging.DEBUG) |
| 20 | + # Store arguments |
| 21 | + ARGS = vars(PARSER.parse_args()) |
| 22 | + ARGS["fasta"] = abspath(ARGS["fasta"]) |
| 23 | + ARGS["bloom"] = abspath(ARGS["bloom"]) |
| 24 | + ARGS["threads"] = int(ARGS["threads"]) # It is parsed as a tuple |
135 | 25 |
|
| 26 | + # Set up the logger |
| 27 | + LOGGER = set_up_logger(ARGS) |
136 | 28 |
|
137 | 29 | # Check inputs
|
138 | 30 | logging.info('Checking input parameters')
|
139 |
| - assert args["kmer"] >= 1, 'ERROR: incorrect kmer size' |
| 31 | + assert ARGS["kmer"] >= 1, 'ERROR: incorrect kmer size' |
140 | 32 | # assert bloom_size
|
141 |
| - assert args["levels"] >= 1, 'ERROR: incorrect number of levels' |
142 |
| - assert args["threads"] >= 1, 'ERROR: incorrect number of threads' |
| 33 | + assert ARGS["levels"] >= 1, 'ERROR: incorrect number of levels' |
| 34 | + assert ARGS["threads"] >= 1, 'ERROR: incorrect number of threads' |
143 | 35 |
|
144 | 36 | # Check if programs are in path
|
145 | 37 | logging.info('Checking if biobloom* and abyss-bloom are in $PATH')
|
146 |
| - assert which('biobloommaker') is not None, 'ERROR: biobloommaker not in PATH' |
147 |
| - assert which('biobloomcategorizer') is not None, 'ERROR: biobloomcategorizer not in PATH' |
148 |
| - assert which('abyss-bloom') is not None, 'ERROR: abyss-bloom not in PATH' |
| 38 | + assert which('biobloommaker') is not None, \ |
| 39 | + 'ERROR: biobloommaker not in PATH' |
| 40 | + assert which('biobloomcategorizer') is not None, \ |
| 41 | + 'ERROR: biobloomcategorizer not in PATH' |
| 42 | + assert which('abyss-bloom') is not None, \ |
| 43 | + 'ERROR: abyss-bloom not in PATH' |
149 | 44 |
|
150 | 45 | # Create output folder if it doesn't exist
|
151 |
| - output_dir = dirname(args["bloom"]) |
152 |
| - if output_dir != "" and not exists(output_dir): |
153 |
| - makedirs(output_dir) |
| 46 | + OUTPUT_DIR = dirname(ARGS["bloom"]) |
| 47 | + if OUTPUT_DIR != "" and not exists(OUTPUT_DIR): |
| 48 | + makedirs(OUTPUT_DIR) |
154 | 49 |
|
155 | 50 | # Run the program
|
156 | 51 | logging.info('Running build_baited_bloom_filter')
|
157 |
| - build_baited_bloom_filter(args) |
| 52 | + build_baited_bloom_filter(ARGS) |
158 | 53 |
|
159 | 54 | logging.info("Done!")
|
0 commit comments