Skip to content

Commit 4b582df

Browse files
committed
fix: no longer error out when the query file contains a non-ACGT base,
which is very common
1 parent edd5441 commit 4b582df

File tree

7 files changed

+257
-37
lines changed

7 files changed

+257
-37
lines changed

.gitignore

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,3 +11,4 @@ python/docs/_generated
1111
__pycache__
1212
cobs*.tar
1313
cobs*.tar.gz
14+
!tests/data/non_acgt_test/index.cobs_classic

cobs/util/query.cpp

Lines changed: 34 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -140,44 +140,44 @@ void destroy_mmap(MMapHandle& handle)
140140
close_file(handle.fd);
141141
}
142142

143-
//! forward character map. A -> A, C -> C, G -> G, T -> T. rest maps to zero.
143+
//! forward character map. A -> A, C -> C, G -> G, T -> T. Every other byte also maps to A, so that non-ACGT bases are handled without erroring out.
144144
static const char canonicalize_basepair_forward_map[256] = {
145-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
146-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
147-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
148-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
149-
0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0,
150-
0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
151-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
152-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
153-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
154-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
155-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
156-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
157-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
158-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
159-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
160-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
145+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
146+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
147+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
148+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
149+
65, 65, 65, 67, 65, 65, 65, 71, 65, 65, 65, 65, 65, 65, 65, 65,
150+
65, 65, 65, 65, 84, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
151+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
152+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
153+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
154+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
155+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
156+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
157+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
158+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
159+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
160+
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
161161
};
162162

163-
//! reverse character map. A -> T, C -> G, G -> C, T -> A. rest maps to zero.
163+
//! reverse character map. A -> T, C -> G, G -> C, T -> A. Every other byte also maps to T (the complement of A), so that non-ACGT bases are handled without erroring out.
164164
static const char canonicalize_basepair_reverse_map[256] = {
165-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
166-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
167-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
168-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
169-
0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0,
170-
0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
171-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
172-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
173-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
174-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
175-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
176-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
177-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
178-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
179-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
180-
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
165+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
166+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
167+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
168+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
169+
84, 84, 84, 71, 84, 84, 84, 67, 84, 84, 84, 84, 84, 84, 84, 84,
170+
84, 84, 84, 84, 65, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
171+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
172+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
173+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
174+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
175+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
176+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
177+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
178+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
179+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
180+
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
181181
};
182182

183183
bool canonicalize_kmer(const char* input, char* output, uint64_t size)
81.7 KB
Binary file not shown.
9.19 KB
Binary file not shown.

tests/data/non_acgt_test/truth.out

Lines changed: 184 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,184 @@
1+
*ERR5069949.2151832 1
2+
genome 120
3+
*ERR5069949.576388 1
4+
genome 47
5+
*ERR5069949.501486 1
6+
genome 116
7+
*ERR5069949.1331889 1
8+
genome 102
9+
*ERR5069949.2161340 1
10+
genome 50
11+
*ERR5069949.973930 0
12+
*ERR5069949.2417063 1
13+
genome 120
14+
*ERR5069949.376959 1
15+
genome 121
16+
*ERR5069949.1088785 0
17+
*ERR5069949.1066259 1
18+
genome 117
19+
*ERR5069949.2832676 1
20+
genome 109
21+
*ERR5069949.2953930 1
22+
genome 121
23+
*ERR5069949.324865 0
24+
*ERR5069949.2185111 1
25+
genome 104
26+
*ERR5069949.937422 1
27+
genome 100
28+
*ERR5069949.2431709 0
29+
*ERR5069949.1246538 1
30+
genome 118
31+
*ERR5069949.1189252 1
32+
genome 68
33+
*ERR5069949.2216307 1
34+
genome 98
35+
*ERR5069949.3273002 1
36+
genome 97
37+
*ERR5069949.3277445 1
38+
genome 121
39+
*ERR5069949.3022231 1
40+
genome 117
41+
*ERR5069949.184542 0
42+
*ERR5069949.540529 1
43+
genome 119
44+
*ERR5069949.686090 1
45+
genome 114
46+
*ERR5069949.2787556 1
47+
genome 63
48+
*ERR5069949.2650879 1
49+
genome 120
50+
*ERR5069949.2064910 1
51+
genome 102
52+
*ERR5069949.2328704 1
53+
genome 120
54+
*ERR5069949.1067032 1
55+
genome 99
56+
*ERR5069949.3338256 1
57+
genome 121
58+
*ERR5069949.1412839 1
59+
genome 117
60+
*ERR5069949.1538968 1
61+
genome 120
62+
*ERR5069949.147998 1
63+
genome 64
64+
*ERR5069949.366975 1
65+
genome 76
66+
*ERR5069949.1372331 1
67+
genome 121
68+
*ERR5069949.1709367 1
69+
genome 94
70+
*ERR5069949.2388984 1
71+
genome 120
72+
*ERR5069949.1132353 1
73+
genome 120
74+
*ERR5069949.1151736 1
75+
genome 121
76+
*ERR5069949.479807 1
77+
genome 120
78+
*ERR5069949.2176303 1
79+
genome 121
80+
*ERR5069949.2772897 1
81+
genome 117
82+
*ERR5069949.1020777 1
83+
genome 92
84+
*ERR5069949.465452 1
85+
genome 98
86+
*ERR5069949.1704586 1
87+
genome 119
88+
*ERR5069949.1258508 1
89+
genome 121
90+
*ERR5069949.986441 0
91+
*ERR5069949.2674295 1
92+
genome 118
93+
*ERR5069949.885966 0
94+
*ERR5069949.2342766 1
95+
genome 121
96+
*ERR5069949.3122970 1
97+
genome 97
98+
*ERR5069949.3279513 0
99+
*ERR5069949.309410 1
100+
genome 121
101+
*ERR5069949.532979 1
102+
genome 119
103+
*ERR5069949.2888794 1
104+
genome 121
105+
*ERR5069949.2205229 1
106+
genome 112
107+
*ERR5069949.786562 1
108+
genome 121
109+
*ERR5069949.919671 1
110+
genome 121
111+
*ERR5069949.1328186 1
112+
genome 121
113+
*ERR5069949.870926 1
114+
genome 119
115+
*ERR5069949.2257580 1
116+
genome 121
117+
*ERR5069949.3249622 0
118+
*ERR5069949.611123 1
119+
genome 95
120+
*ERR5069949.651338 0
121+
*ERR5069949.169513 1
122+
genome 62
123+
*ERR5069949.155944 0
124+
*ERR5069949.2033605 1
125+
genome 120
126+
*ERR5069949.2730382 1
127+
genome 112
128+
*ERR5069949.2125592 1
129+
genome 120
130+
*ERR5069949.1062611 1
131+
genome 121
132+
*ERR5069949.1778133 1
133+
genome 117
134+
*ERR5069949.3057020 1
135+
genome 60
136+
*ERR5069949.2972968 0
137+
*ERR5069949.2734474 1
138+
genome 119
139+
*ERR5069949.856527 1
140+
genome 100
141+
*ERR5069949.2098070 1
142+
genome 121
143+
*ERR5069949.1552198 1
144+
genome 120
145+
*ERR5069949.2385514 1
146+
genome 120
147+
*ERR5069949.2270078 1
148+
genome 121
149+
*ERR5069949.114870 1
150+
genome 120
151+
*ERR5069949.2668880 1
152+
genome 117
153+
*ERR5069949.257821 1
154+
genome 109
155+
*ERR5069949.2243023 1
156+
genome 120
157+
*ERR5069949.2605155 1
158+
genome 116
159+
*ERR5069949.1340552 1
160+
genome 121
161+
*ERR5069949.1561137 1
162+
genome 120
163+
*ERR5069949.2361683 1
164+
genome 119
165+
*ERR5069949.2521353 0
166+
*ERR5069949.1261808 0
167+
*ERR5069949.2734873 1
168+
genome 68
169+
*ERR5069949.3017828 1
170+
genome 77
171+
*ERR5069949.573706 1
172+
genome 97
173+
*ERR5069949.1980512 1
174+
genome 121
175+
*ERR5069949.1014693 1
176+
genome 120
177+
*ERR5069949.3184655 1
178+
genome 120
179+
*ERR5069949.29668 0
180+
*ERR5069949.3258358 0
181+
*ERR5069949.1476386 1
182+
genome 121
183+
*ERR5069949.2415814 1
184+
genome 120

tests/non_acgt_test.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#include "test_util.hpp"
2+
#include <cobs/util/fs.hpp>
3+
#include <gtest/gtest.h>
4+
5+
namespace fs = cobs::fs;
6+
7+
static fs::path base_dir = "data/non_acgt_test";
8+
static fs::path work_dir = base_dir / "work";
9+
10+
class non_acgt_test : public ::testing::Test
11+
{
12+
protected:
13+
static void SetUpTestSuite() {
14+
cobs::error_code ec;
15+
fs::create_directories(work_dir, ec);
16+
}
17+
static void TearDownTestSuite() {
18+
cobs::error_code ec;
19+
fs::remove_all(work_dir, ec);
20+
}
21+
};
22+
23+
24+
TEST_F(non_acgt_test, non_acgt_test_main_test) {
25+
fs::path index_file{base_dir / "index.cobs_classic"};
26+
cobs::ClassicSearch s(index_file);
27+
fs::path query_file{base_dir / "test_1.fastq.gz"};
28+
auto query_out_filepath = work_dir / "query.out";
29+
std::ofstream query_out_fh(query_out_filepath.string());
30+
cobs::process_query(s, 0.80000000000000004, 0, "", query_file.string(), query_out_fh);
31+
query_out_fh.close();
32+
assert_equals_files_with_paths(
33+
base_dir / "truth.out",
34+
query_out_filepath);
35+
}

tests/util.cpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,13 +50,13 @@ TEST(util, kmer_canonicalize) {
5050

5151
// one kmer already canonical but containing invalid letters
5252
test_kmer("AGGAAAGTCTTTTACGCTGGGXXXAGAGTGA",
53-
"AGGAAAGTCTTTTACGCTGGG\0\0\0AGAGTGA", false);
53+
"AGGAAAGTCTTTTACGCTGGGAAAAGAGTGA", true);
5454
// one k-mer needing flipping containing invalid letters
5555
test_kmer("TGGAAAGTCTTTTACGCTGGGXXXAGAGTGA",
56-
"TCACTCT\0\0\0CCCAGCGTAAAAGACTTTCCA", false);
56+
"TCACTCTTTTCCCAGCGTAAAAGACTTTCCA", true);
5757
// one kmer containing the invalid letter at the center
5858
test_kmer("AAAAAAAAAAAAAAAXTTTTTTTTTTTTTTT",
59-
"AAAAAAAAAAAAAAA\0TTTTTTTTTTTTTTT", false);
59+
"AAAAAAAAAAAAAAAATTTTTTTTTTTTTTT", true);
6060
}
6161

6262
/******************************************************************************/

0 commit comments

Comments
 (0)