diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 325609c..64f23d7 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -47,7 +47,7 @@ jobs: run: | mkdir build cd build - cmake -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 .. + cmake -DCMAKE_C_COMPILER=gcc-11 -DCMAKE_CXX_COMPILER=g++-11 -DSKIP_PYTHON=1 .. make make test diff --git a/.gitignore b/.gitignore index e8b352a..b83277b 100644 --- a/.gitignore +++ b/.gitignore @@ -11,3 +11,4 @@ python/docs/_generated __pycache__ cobs*.tar cobs*.tar.gz +!tests/data/non_acgt_test/index.cobs_classic diff --git a/cobs/util/query.cpp b/cobs/util/query.cpp index 98dd82d..6ab9256 100644 --- a/cobs/util/query.cpp +++ b/cobs/util/query.cpp @@ -140,44 +140,44 @@ void destroy_mmap(MMapHandle& handle) close_file(handle.fd); } -//! forward character map. A -> A, C -> C, G -> G, T -> T. rest maps to zero. +//! forward character map. A -> A, C -> C, G -> G, T -> T. rest also maps to A to handle non-ACGT bases and not error out. static const char canonicalize_basepair_forward_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 65, 0, 67, 0, 0, 0, 71, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 84, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 67, 65, 65, 65, 71, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 84, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, + 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, 65, }; -//! reverse character map. A -> T, C -> G, G -> C, T -> A. rest maps to zero. +//! reverse character map. A -> T, C -> G, G -> C, T -> A. rest also maps to T to handle non-ACGT bases and not error out. static const char canonicalize_basepair_reverse_map[256] = { - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 84, 0, 71, 0, 0, 0, 67, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, - 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 71, 84, 84, 84, 67, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 65, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, + 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, }; bool canonicalize_kmer(const char* input, char* output, uint64_t size) diff --git a/tests/data/non_acgt_test/index.cobs_classic b/tests/data/non_acgt_test/index.cobs_classic new file mode 100644 index 0000000..2caef36 Binary files /dev/null and b/tests/data/non_acgt_test/index.cobs_classic differ diff --git a/tests/data/non_acgt_test/test_1.fastq.gz b/tests/data/non_acgt_test/test_1.fastq.gz new file mode 100644 index 0000000..b4bdd21 Binary files /dev/null and b/tests/data/non_acgt_test/test_1.fastq.gz differ diff --git a/tests/data/non_acgt_test/truth.out b/tests/data/non_acgt_test/truth.out new file mode 100644 index 0000000..f0544c7 --- /dev/null +++ b/tests/data/non_acgt_test/truth.out @@ -0,0 +1,184 @@ +*ERR5069949.2151832 1 +genome 120 +*ERR5069949.576388 1 +genome 47 +*ERR5069949.501486 1 +genome 116 +*ERR5069949.1331889 1 +genome 102 +*ERR5069949.2161340 1 +genome 50 +*ERR5069949.973930 0 +*ERR5069949.2417063 1 +genome 120 +*ERR5069949.376959 1 +genome 121 +*ERR5069949.1088785 0 +*ERR5069949.1066259 1 +genome 117 +*ERR5069949.2832676 1 +genome 109 +*ERR5069949.2953930 1 +genome 121 +*ERR5069949.324865 0 +*ERR5069949.2185111 1 +genome 104 +*ERR5069949.937422 1 +genome 100 +*ERR5069949.2431709 0 +*ERR5069949.1246538 1 +genome 118 +*ERR5069949.1189252 1 +genome 68 +*ERR5069949.2216307 1 +genome 98 +*ERR5069949.3273002 1 +genome 97 +*ERR5069949.3277445 1 +genome 121 +*ERR5069949.3022231 1 +genome 117 +*ERR5069949.184542 0 +*ERR5069949.540529 1 +genome 119 +*ERR5069949.686090 1 +genome 114 +*ERR5069949.2787556 1 +genome 63 +*ERR5069949.2650879 1 +genome 120 +*ERR5069949.2064910 1 +genome 102 +*ERR5069949.2328704 1 +genome 120 +*ERR5069949.1067032 1 +genome 99 +*ERR5069949.3338256 1 +genome 121 +*ERR5069949.1412839 1 +genome 117 +*ERR5069949.1538968 1 +genome 120 +*ERR5069949.147998 1 +genome 64 +*ERR5069949.366975 1 +genome 76 +*ERR5069949.1372331 1 +genome 121 +*ERR5069949.1709367 1 +genome 94 +*ERR5069949.2388984 1 +genome 120 +*ERR5069949.1132353 1 +genome 120 +*ERR5069949.1151736 1 +genome 121 +*ERR5069949.479807 1 +genome 120 +*ERR5069949.2176303 1 +genome 121 +*ERR5069949.2772897 1 +genome 117 +*ERR5069949.1020777 1 +genome 92 +*ERR5069949.465452 1 +genome 98 +*ERR5069949.1704586 1 +genome 119 +*ERR5069949.1258508 1 +genome 121 +*ERR5069949.986441 0 +*ERR5069949.2674295 1 +genome 118 +*ERR5069949.885966 0 +*ERR5069949.2342766 1 +genome 121 +*ERR5069949.3122970 1 +genome 97 +*ERR5069949.3279513 0 +*ERR5069949.309410 1 +genome 121 +*ERR5069949.532979 1 +genome 119 +*ERR5069949.2888794 1 +genome 121 +*ERR5069949.2205229 1 +genome 112 +*ERR5069949.786562 1 +genome 121 +*ERR5069949.919671 1 +genome 121 +*ERR5069949.1328186 1 +genome 121 +*ERR5069949.870926 1 +genome 119 +*ERR5069949.2257580 1 +genome 121 +*ERR5069949.3249622 0 +*ERR5069949.611123 1 +genome 95 +*ERR5069949.651338 0 +*ERR5069949.169513 1 +genome 62 +*ERR5069949.155944 0 +*ERR5069949.2033605 1 +genome 120 +*ERR5069949.2730382 1 +genome 112 +*ERR5069949.2125592 1 +genome 120 +*ERR5069949.1062611 1 +genome 121 +*ERR5069949.1778133 1 +genome 117 +*ERR5069949.3057020 1 +genome 60 +*ERR5069949.2972968 0 +*ERR5069949.2734474 1 +genome 119 +*ERR5069949.856527 1 +genome 100 +*ERR5069949.2098070 1 +genome 121 +*ERR5069949.1552198 1 +genome 120 +*ERR5069949.2385514 1 +genome 120 +*ERR5069949.2270078 1 +genome 121 +*ERR5069949.114870 1 +genome 120 +*ERR5069949.2668880 1 +genome 117 +*ERR5069949.257821 1 +genome 109 +*ERR5069949.2243023 1 +genome 120 +*ERR5069949.2605155 1 +genome 116 +*ERR5069949.1340552 1 +genome 121 +*ERR5069949.1561137 1 +genome 120 +*ERR5069949.2361683 1 +genome 119 +*ERR5069949.2521353 0 +*ERR5069949.1261808 0 +*ERR5069949.2734873 1 +genome 68 +*ERR5069949.3017828 1 +genome 77 +*ERR5069949.573706 1 +genome 97 +*ERR5069949.1980512 1 +genome 121 +*ERR5069949.1014693 1 +genome 120 +*ERR5069949.3184655 1 +genome 120 +*ERR5069949.29668 0 +*ERR5069949.3258358 0 +*ERR5069949.1476386 1 +genome 121 +*ERR5069949.2415814 1 +genome 120 diff --git a/tests/non_acgt_test.cpp b/tests/non_acgt_test.cpp new file mode 100644 index 0000000..b0186af --- /dev/null +++ b/tests/non_acgt_test.cpp @@ -0,0 +1,35 @@ +#include "test_util.hpp" +#include +#include + +namespace fs = cobs::fs; + +static fs::path base_dir = "data/non_acgt_test"; +static fs::path work_dir = base_dir / "work"; + +class non_acgt_test : public ::testing::Test +{ +protected: + static void SetUpTestSuite() { + cobs::error_code ec; + fs::create_directories(work_dir, ec); + } + static void TearDownTestSuite() { + cobs::error_code ec; + fs::remove_all(work_dir, ec); + } +}; + + +TEST_F(non_acgt_test, non_acgt_test_main_test) { + fs::path index_file{base_dir / "index.cobs_classic"}; + cobs::ClassicSearch s(index_file.string()); + fs::path query_file{base_dir / "test_1.fastq.gz"}; + auto query_out_filepath = work_dir / "query.out"; + std::ofstream query_out_fh(query_out_filepath.string()); + cobs::process_query(s, 0.80000000000000004, 0, "", query_file.string(), query_out_fh); + query_out_fh.close(); + assert_equals_files_with_paths( + base_dir / "truth.out", + query_out_filepath); +} diff --git a/tests/util.cpp b/tests/util.cpp index 046a17c..4c09875 100644 --- a/tests/util.cpp +++ b/tests/util.cpp @@ -50,13 +50,13 @@ TEST(util, kmer_canonicalize) { // one kmer already canonical but containing invalid letters test_kmer("AGGAAAGTCTTTTACGCTGGGXXXAGAGTGA", - "AGGAAAGTCTTTTACGCTGGG\0\0\0AGAGTGA", false); + "AGGAAAGTCTTTTACGCTGGGAAAAGAGTGA", true); // one k-mer needing flipping containing invalid letters test_kmer("TGGAAAGTCTTTTACGCTGGGXXXAGAGTGA", - "TCACTCT\0\0\0CCCAGCGTAAAAGACTTTCCA", false); + "TCACTCTTTTCCCAGCGTAAAAGACTTTCCA", true); // one kmer containing the invalid letter at the center test_kmer("AAAAAAAAAAAAAAAXTTTTTTTTTTTTTTT", - "AAAAAAAAAAAAAAA\0TTTTTTTTTTTTTTT", false); + "AAAAAAAAAAAAAAAATTTTTTTTTTTTTTT", true); } /******************************************************************************/