From 2f9a8580666e732b2d81ecbe28180c780025f0e2 Mon Sep 17 00:00:00 2001 From: Anton Pirogov Date: Sun, 19 Jun 2016 15:06:34 +0200 Subject: [PATCH] some cleanup, now wrap output of ML-factors with -p --- Makefile | 2 +- README.md | 3 +-- src/args.cpp | 2 +- src/complexity.cpp | 5 ++++- src/factors.cpp | 20 -------------------- src/factors.h | 29 ----------------------------- src/matchlength.cpp | 27 +++++++++++++++++++++++++-- src/matchlength.h | 18 +++++++++++++++++- src/util.cpp | 23 ----------------------- src/util.h | 2 -- tests/matchlength_tests.cpp | 5 ++--- tests/util_tests.cpp | 16 ---------------- 12 files changed, 51 insertions(+), 101 deletions(-) delete mode 100644 src/factors.cpp delete mode 100644 src/factors.h diff --git a/Makefile b/Makefile index f2db037..124d973 100644 --- a/Makefile +++ b/Makefile @@ -54,7 +54,7 @@ $(TESTS): $(OBJECTS) $(TEST_OBJ) clean: $(RM) -r build - $(RM) $(OBJECTS) $(TEST_OBJ) $(TESTS) + $(RM) tests/*.o src/*.o tests: $(TESTS) bash ./tests/runtests.sh diff --git a/README.md b/README.md index faabc07..da3a39a 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,7 @@ # dnalc Experimental local neighborhood complexity estimation for DNA, -based on match-length factorization (derived from Lempel-Ziv-factorization) -and periodicity counting (detection of repetitive DNA). +based on match-length factorization (derived from Lempel-Ziv-factorization). ### Build diff --git a/src/args.cpp b/src/args.cpp index c915591..70d4047 100644 --- a/src/args.cpp +++ b/src/args.cpp @@ -38,7 +38,7 @@ static char const usage[] = PROGNAME "\t-l: list sequences stored in index file\n" "\t-n : calculate for given sequence within file (default: 0=all)\n" - "\t-p: print match-length and Lempel-Ziv factors and periodicities\n" + "\t-p: print match-length factors\n" "\t-b: print benchmarking information\n" "\t-g N: output pipe-ready to plot with:\n" "\t\tN=1 -> dnalc_plot.sh (-> gnuplot)\n" diff --git a/src/complexity.cpp b/src/complexity.cpp index 423e7ad..0e2c342 100644 --- a/src/complexity.cpp +++ b/src/complexity.cpp @@ -1,4 +1,7 @@ #include +#include +#include +#include using namespace std; #include "args.h" //args.p @@ -36,7 +39,7 @@ queue calcNAWindows(size_t offset, size_t n, size_t w, size_t k, for each_window(n, w, k) { - //kick out runs that are now outside of window + //kick out intervals that are now outside of window for (auto it=bad.begin(); it != bad.end(); it++) if (it->second < l+offset) { it = bad.erase(it); diff --git a/src/factors.cpp b/src/factors.cpp deleted file mode 100644 index a92cd44..0000000 --- a/src/factors.cpp +++ /dev/null @@ -1,20 +0,0 @@ -#include -using namespace std; - -#include "factors.h" - -void Fact::print() const { - char tmp; - size_t n = fact.size(); - char *s = (char *)this->str; // we write there, but restore it back! - for (size_t i = 0; i < n; i++) { - size_t start = this->fact[i]; - size_t end = i < n - 1 ? this->fact[i + 1] : this->strLen; - tmp = this->str[end]; - s[end] = '\0'; - cout << this->str + start << (i < n - 1 ? "." : "\n"); - s[end] = tmp; - } -} - -extern inline size_t factLen(Fact &f, size_t i); diff --git a/src/factors.h b/src/factors.h deleted file mode 100644 index 5fa2c44..0000000 --- a/src/factors.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once -#include -#include -#include - -#include "config.h" - -/* a factorization of a string (LempelZiv or MatchLength) */ -struct Fact { - void print() const; - - uint_vec fact; /* positions of factors */ - uint_vec lpf; /* lpf in case of Lempel-Ziv, otherwise empty */ - std::vector prevOcc; /* LZ: prevOcc array (previous occurence of factor s_j) */ - - char const *str; /* string */ - size_t strLen; /* string length */ - - double cObs, cMax, cMin, cNor; -}; - -// length of factor -inline size_t factLen(Fact const &f, size_t i) { - if (i == 0) - return f.fact[1]; - if (i == f.fact.size() - 1) - return f.strLen - f.fact[f.fact.size() - 1]; - return f.fact[i + 1] - f.fact[i]; -} diff --git a/src/matchlength.cpp b/src/matchlength.cpp index c1b44d1..bbfb25c 100644 --- a/src/matchlength.cpp +++ b/src/matchlength.cpp @@ -7,12 +7,35 @@ #include "shulen.h" #include #include +#include +#include using namespace std; +void Fact::print() const { + stringstream ss; + size_t n = fact.size(); + for (size_t i = 0; i < n; i++) { + size_t start = this->fact[i]; + size_t end = i < n - 1 ? this->fact[i + 1] : this->strLen; + ss << string(this->str + start, end-start) << (i < n - 1 ? "." : ""); + } + string s = ss.str(); + for (size_t i = 0; i < s.size(); i+=80) + cout << s.substr(i, min(80UL, s.size()-i)) << endl; +} + +// length of factor +size_t Fact::factLen(size_t i) const { + Fact const &f = *this; + if (i == 0) + return f.fact[1]; + if (i == f.fact.size() - 1) + return f.strLen - f.fact[f.fact.size() - 1]; + return f.fact[i + 1] - f.fact[i]; +} + void computeMLFact(Fact &mlf, Esa const &esa) { - mlf.prevOcc.clear(); mlf.fact.resize(0); - mlf.lpf.resize(0); mlf.str = esa.str; mlf.strLen = esa.n; diff --git a/src/matchlength.h b/src/matchlength.h index affa942..c7783cd 100644 --- a/src/matchlength.h +++ b/src/matchlength.h @@ -1,5 +1,21 @@ #pragma once -#include "factors.h" +#include +#include +#include + +#include "config.h" #include "esa.h" +/* a match-length factorization of a string */ +struct Fact { + void print() const; + size_t factLen(size_t i) const; + + uint_vec fact; /* positions of factors */ + + char const *str; /* string */ + size_t strLen; /* string length */ +}; + + void computeMLFact(Fact &fact, Esa const &esa); diff --git a/src/util.cpp b/src/util.cpp index 73e0a6e..eed1e86 100644 --- a/src/util.cpp +++ b/src/util.cpp @@ -51,17 +51,6 @@ string randSeq(size_t n, string alphabet) { return s; } -string randRun(size_t n, size_t l, string alphabet) { - if (l > n) - l = n; - string s; - string rep = randSeq(l, alphabet); - while (s.size() < n) - s += rep; - s.resize(n); - return s; -} - // calculate the GC content double gcContent(string const &s) { size_t gc = 0; @@ -171,15 +160,3 @@ bool with_mmap(char const *file, function lambda) { return ret; } -// fprintnf: print max of n characters of str onto fp; -// add ... if str was truncated -void fprintnf(FILE *fp, char const *str, int n) { - int i, l, m; - l = strlen(str); - m = n < l ? n : l; - for (i = 0; i < m; i++) - fprintf(fp, "%c", str[i]); - if (m < l) - fprintf(fp, "..."); -} - diff --git a/src/util.h b/src/util.h index 3b82bd0..cb2969d 100644 --- a/src/util.h +++ b/src/util.h @@ -8,12 +8,10 @@ std::string randSeq(size_t n, std::string alphabet = "ACGT"); std::string randSeq(size_t n, double gc); -std::string randRun(size_t n, size_t l, std::string alphabet = "ACGT"); double gcContent(std::string const &s); std::string revComp(std::string const &s); std::string base_name(std::string const & path); -void fprintnf(FILE *fp, char const *str, int n); int open_or_fail(char const *fname, int flag); FILE *fopen_or_fail(char const *fname, char const *flags); bool with_file(char const *file, std::function lambda, std::ios_base::openmode mode=std::ios_base::in); diff --git a/tests/matchlength_tests.cpp b/tests/matchlength_tests.cpp index 37a7274..0926cc9 100644 --- a/tests/matchlength_tests.cpp +++ b/tests/matchlength_tests.cpp @@ -27,10 +27,9 @@ void checkML(string seq, string facts[], size_t num) { computeMLFact(mlf, esa); mu_assert(mlf.fact.size() == num, "wrong number of ML factors"); - for (size_t i = 0; i < mlf.fact.size(); i++) { - mu_assert(!strncmp(mlf.str + mlf.fact[i], facts[i].c_str(), factLen(mlf, i)), + for (size_t i = 0; i < mlf.fact.size(); i++) + mu_assert(!strncmp(mlf.str + mlf.fact[i], facts[i].c_str(), mlf.factLen(i)), "wrong factor"); - } } void test_MatchLength1() { return checkML(seq1, factors1, 8); } diff --git a/tests/util_tests.cpp b/tests/util_tests.cpp index ed9bb88..b79ff67 100644 --- a/tests/util_tests.cpp +++ b/tests/util_tests.cpp @@ -30,21 +30,6 @@ void test_randSeq() { mu_assert(alphabet.find(c) != string::npos, "invalid character: " << c); } -void test_randRun() { - size_t n = 1000; - for (auto l : vector{1, 2, 4, 8, 16}) { - string runseq = randRun(n, l); - string per = runseq.substr(0, l); - size_t cnt = 0; - size_t pos = 0; - while (runseq.find(per, pos) != string::npos) { - cnt++; - pos += l; - } - mu_assert_eq(n / l, cnt, "wrong number of repeats"); - } -} - // test reverse complement. assumes normalized sequence, so just ACGT toggle // and whole string is reversed. void test_revComp() { @@ -55,7 +40,6 @@ void test_revComp() { void all_tests() { mu_run_test(test_randSeq); - mu_run_test(test_randRun); mu_run_test(test_revComp); } RUN_TESTS(all_tests)