Skip to content

Commit

Permalink
Merge pull request #22 from leoisl/fix/non_acgt_bases_in_query
Browse files Browse the repository at this point in the history
fix: not erroring out when a non-ACGT base is in the query file
  • Loading branch information
leoisl authored Dec 7, 2023
2 parents edd5441 + 099e459 commit 458b5b6
Show file tree
Hide file tree
Showing 8 changed files with 258 additions and 38 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/build.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ jobs:
run: |
mkdir build
cd build
cmake -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 ..
cmake -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 -DSKIP_PYTHON=1 ..
make
make test
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -11,3 +11,4 @@ python/docs/_generated
__pycache__
cobs*.tar
cobs*.tar.gz
!tests/data/non_acgt_test/index.cobs_classic
68 changes: 34 additions & 34 deletions cobs/util/query.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -140,44 +140,44 @@ void destroy_mmap(MMapHandle& handle)
close_file(handle.fd);
}

//! forward character map. A -> A, C -> C, G -> G, T -> T. rest maps to zero.
//! forward character map. A -> A, C -> C, G -> G, T -> T. rest also maps to A to handle non-ACGT bases and not error out.
static const char canonicalize_basepair_forward_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 67, 65, 65, 65, 71, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 84, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65,
};

//! reverse character map. A -> T, C -> G, G -> C, T -> A. rest maps to zero.
//! reverse character map. A -> T, C -> G, G -> C, T -> A. rest also maps to T to handle non-ACGT bases and not error out.
static const char canonicalize_basepair_reverse_map[256] = {
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 71, 84, 84, 84, 67, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 65, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84,
};

bool canonicalize_kmer(const char* input, char* output, uint64_t size)
Expand Down
Binary file added tests/data/non_acgt_test/index.cobs_classic
Binary file not shown.
Binary file added tests/data/non_acgt_test/test_1.fastq.gz
Binary file not shown.
184 changes: 184 additions & 0 deletions tests/data/non_acgt_test/truth.out
Original file line number Diff line number Diff line change
@@ -0,0 +1,184 @@
*ERR5069949.2151832 1
genome 120
*ERR5069949.576388 1
genome 47
*ERR5069949.501486 1
genome 116
*ERR5069949.1331889 1
genome 102
*ERR5069949.2161340 1
genome 50
*ERR5069949.973930 0
*ERR5069949.2417063 1
genome 120
*ERR5069949.376959 1
genome 121
*ERR5069949.1088785 0
*ERR5069949.1066259 1
genome 117
*ERR5069949.2832676 1
genome 109
*ERR5069949.2953930 1
genome 121
*ERR5069949.324865 0
*ERR5069949.2185111 1
genome 104
*ERR5069949.937422 1
genome 100
*ERR5069949.2431709 0
*ERR5069949.1246538 1
genome 118
*ERR5069949.1189252 1
genome 68
*ERR5069949.2216307 1
genome 98
*ERR5069949.3273002 1
genome 97
*ERR5069949.3277445 1
genome 121
*ERR5069949.3022231 1
genome 117
*ERR5069949.184542 0
*ERR5069949.540529 1
genome 119
*ERR5069949.686090 1
genome 114
*ERR5069949.2787556 1
genome 63
*ERR5069949.2650879 1
genome 120
*ERR5069949.2064910 1
genome 102
*ERR5069949.2328704 1
genome 120
*ERR5069949.1067032 1
genome 99
*ERR5069949.3338256 1
genome 121
*ERR5069949.1412839 1
genome 117
*ERR5069949.1538968 1
genome 120
*ERR5069949.147998 1
genome 64
*ERR5069949.366975 1
genome 76
*ERR5069949.1372331 1
genome 121
*ERR5069949.1709367 1
genome 94
*ERR5069949.2388984 1
genome 120
*ERR5069949.1132353 1
genome 120
*ERR5069949.1151736 1
genome 121
*ERR5069949.479807 1
genome 120
*ERR5069949.2176303 1
genome 121
*ERR5069949.2772897 1
genome 117
*ERR5069949.1020777 1
genome 92
*ERR5069949.465452 1
genome 98
*ERR5069949.1704586 1
genome 119
*ERR5069949.1258508 1
genome 121
*ERR5069949.986441 0
*ERR5069949.2674295 1
genome 118
*ERR5069949.885966 0
*ERR5069949.2342766 1
genome 121
*ERR5069949.3122970 1
genome 97
*ERR5069949.3279513 0
*ERR5069949.309410 1
genome 121
*ERR5069949.532979 1
genome 119
*ERR5069949.2888794 1
genome 121
*ERR5069949.2205229 1
genome 112
*ERR5069949.786562 1
genome 121
*ERR5069949.919671 1
genome 121
*ERR5069949.1328186 1
genome 121
*ERR5069949.870926 1
genome 119
*ERR5069949.2257580 1
genome 121
*ERR5069949.3249622 0
*ERR5069949.611123 1
genome 95
*ERR5069949.651338 0
*ERR5069949.169513 1
genome 62
*ERR5069949.155944 0
*ERR5069949.2033605 1
genome 120
*ERR5069949.2730382 1
genome 112
*ERR5069949.2125592 1
genome 120
*ERR5069949.1062611 1
genome 121
*ERR5069949.1778133 1
genome 117
*ERR5069949.3057020 1
genome 60
*ERR5069949.2972968 0
*ERR5069949.2734474 1
genome 119
*ERR5069949.856527 1
genome 100
*ERR5069949.2098070 1
genome 121
*ERR5069949.1552198 1
genome 120
*ERR5069949.2385514 1
genome 120
*ERR5069949.2270078 1
genome 121
*ERR5069949.114870 1
genome 120
*ERR5069949.2668880 1
genome 117
*ERR5069949.257821 1
genome 109
*ERR5069949.2243023 1
genome 120
*ERR5069949.2605155 1
genome 116
*ERR5069949.1340552 1
genome 121
*ERR5069949.1561137 1
genome 120
*ERR5069949.2361683 1
genome 119
*ERR5069949.2521353 0
*ERR5069949.1261808 0
*ERR5069949.2734873 1
genome 68
*ERR5069949.3017828 1
genome 77
*ERR5069949.573706 1
genome 97
*ERR5069949.1980512 1
genome 121
*ERR5069949.1014693 1
genome 120
*ERR5069949.3184655 1
genome 120
*ERR5069949.29668 0
*ERR5069949.3258358 0
*ERR5069949.1476386 1
genome 121
*ERR5069949.2415814 1
genome 120
35 changes: 35 additions & 0 deletions tests/non_acgt_test.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
#include "test_util.hpp"
#include <cobs/util/fs.hpp>
#include <gtest/gtest.h>

namespace fs = cobs::fs;

static fs::path base_dir = "data/non_acgt_test";
static fs::path work_dir = base_dir / "work";

class non_acgt_test : public ::testing::Test
{
protected:
static void SetUpTestSuite() {
cobs::error_code ec;
fs::create_directories(work_dir, ec);
}
static void TearDownTestSuite() {
cobs::error_code ec;
fs::remove_all(work_dir, ec);
}
};


TEST_F(non_acgt_test, non_acgt_test_main_test) {
fs::path index_file{base_dir / "index.cobs_classic"};
cobs::ClassicSearch s(index_file.string());
fs::path query_file{base_dir / "test_1.fastq.gz"};
auto query_out_filepath = work_dir / "query.out";
std::ofstream query_out_fh(query_out_filepath.string());
cobs::process_query(s, 0.80000000000000004, 0, "", query_file.string(), query_out_fh);
query_out_fh.close();
assert_equals_files_with_paths(
base_dir / "truth.out",
query_out_filepath);
}
6 changes: 3 additions & 3 deletions tests/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,13 +50,13 @@ TEST(util, kmer_canonicalize) {

// one kmer already canonical but containing invalid letters
test_kmer("AGGAAAGTCTTTTACGCTGGGXXXAGAGTGA",
"AGGAAAGTCTTTTACGCTGGG\0\0\0AGAGTGA", false);
"AGGAAAGTCTTTTACGCTGGGAAAAGAGTGA", true);
// one k-mer needing flipping containing invalid letters
test_kmer("TGGAAAGTCTTTTACGCTGGGXXXAGAGTGA",
"TCACTCT\0\0\0CCCAGCGTAAAAGACTTTCCA", false);
"TCACTCTTTTCCCAGCGTAAAAGACTTTCCA", true);
// one kmer containing the invalid letter at the center
test_kmer("AAAAAAAAAAAAAAAXTTTTTTTTTTTTTTT",
"AAAAAAAAAAAAAAA\0TTTTTTTTTTTTTTT", false);
"AAAAAAAAAAAAAAAATTTTTTTTTTTTTTT", true);
}

/******************************************************************************/

0 comments on commit 458b5b6

Please sign in to comment.