Skip to content

Commit

Permalink
some cleanup, now wrap output of ML-factors with -p
Browse files Browse the repository at this point in the history
  • Loading branch information
Anton Pirogov committed Jun 19, 2016
1 parent ec640c6 commit 2f9a858
Show file tree
Hide file tree
Showing 12 changed files with 51 additions and 101 deletions.
2 changes: 1 addition & 1 deletion Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ $(TESTS): $(OBJECTS) $(TEST_OBJ)

clean:
$(RM) -r build
$(RM) $(OBJECTS) $(TEST_OBJ) $(TESTS)
$(RM) tests/*.o src/*.o

tests: $(TESTS)
bash ./tests/runtests.sh
Expand Down
3 changes: 1 addition & 2 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,8 +1,7 @@
# dnalc

Experimental local neighborhood complexity estimation for DNA,
based on match-length factorization (derived from Lempel-Ziv-factorization)
and periodicity counting (detection of repetitive DNA).
based on match-length factorization (derived from Lempel-Ziv-factorization).

### Build

Expand Down
2 changes: 1 addition & 1 deletion src/args.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -38,7 +38,7 @@ static char const usage[] = PROGNAME
"\t-l: list sequences stored in index file\n"
"\t-n <NUM>: calculate for given sequence within file (default: 0=all)\n"

"\t-p: print match-length and Lempel-Ziv factors and periodicities\n"
"\t-p: print match-length factors\n"
"\t-b: print benchmarking information\n"
"\t-g N: output pipe-ready to plot with:\n"
"\t\tN=1 -> dnalc_plot.sh (-> gnuplot)\n"
Expand Down
5 changes: 4 additions & 1 deletion src/complexity.cpp
Original file line number Diff line number Diff line change
@@ -1,4 +1,7 @@
#include <queue>
#include <cassert>
#include <iostream>
#include <algorithm>
using namespace std;

#include "args.h" //args.p
Expand Down Expand Up @@ -36,7 +39,7 @@ queue<size_t> calcNAWindows(size_t offset, size_t n, size_t w, size_t k,

for
each_window(n, w, k) {
//kick out runs that are now outside of window
//kick out intervals that are now outside of window
for (auto it=bad.begin(); it != bad.end(); it++)
if (it->second < l+offset) {
it = bad.erase(it);
Expand Down
20 changes: 0 additions & 20 deletions src/factors.cpp

This file was deleted.

29 changes: 0 additions & 29 deletions src/factors.h

This file was deleted.

27 changes: 25 additions & 2 deletions src/matchlength.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -7,12 +7,35 @@
#include "shulen.h"
#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>
using namespace std;

void Fact::print() const {
stringstream ss;
size_t n = fact.size();
for (size_t i = 0; i < n; i++) {
size_t start = this->fact[i];
size_t end = i < n - 1 ? this->fact[i + 1] : this->strLen;
ss << string(this->str + start, end-start) << (i < n - 1 ? "." : "");
}
string s = ss.str();
for (size_t i = 0; i < s.size(); i+=80)
cout << s.substr(i, min(80UL, s.size()-i)) << endl;
}

// length of factor
size_t Fact::factLen(size_t i) const {
Fact const &f = *this;
if (i == 0)
return f.fact[1];
if (i == f.fact.size() - 1)
return f.strLen - f.fact[f.fact.size() - 1];
return f.fact[i + 1] - f.fact[i];
}

void computeMLFact(Fact &mlf, Esa const &esa) {
mlf.prevOcc.clear();
mlf.fact.resize(0);
mlf.lpf.resize(0);
mlf.str = esa.str;
mlf.strLen = esa.n;

Expand Down
18 changes: 17 additions & 1 deletion src/matchlength.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,21 @@
#pragma once
#include "factors.h"
#include <cstddef>
#include <cstdint>
#include <vector>

#include "config.h"
#include "esa.h"

/* a match-length factorization of a string
 *
 * The factorization is stored as a list of positions into str together with
 * the string pointer and its length. computeMLFact() fills in all three
 * fields; str is assigned from the Esa passed to it (presumably not owned by
 * this struct -- confirm lifetime against the caller). */
struct Fact {
  /* print the factors to stdout, separated by '.', wrapped at 80 columns */
  void print() const;
  /* length of the i-th factor */
  size_t factLen(size_t i) const;

  uint_vec fact; /* positions of factors */

  /* Default-initialized so that a freshly constructed Fact is a well-defined
   * empty factorization instead of holding indeterminate values. */
  char const *str = nullptr; /* string */
  size_t strLen = 0;         /* string length */
};


void computeMLFact(Fact &fact, Esa const &esa);
23 changes: 0 additions & 23 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,17 +51,6 @@ string randSeq(size_t n, string alphabet) {
return s;
}

// Generate a sequence of length n that consists of one random unit of length
// l (drawn via randSeq from the given alphabet) repeated back to back; the
// final repetition is cut off so that the result has exactly n characters.
// A period l larger than n is clamped to n.
//
// If no repeat unit can be produced (l == 0, or randSeq returns an empty
// string) the empty string is returned; previously this case never
// terminated because appending an empty unit cannot grow the result.
string randRun(size_t n, size_t l, string alphabet) {
  if (l > n)
    l = n;
  string s;
  string const rep = randSeq(l, alphabet);
  if (rep.empty())
    return s;
  // the loop may overshoot n by less than one unit before the final resize
  s.reserve(n + rep.size());
  while (s.size() < n)
    s += rep;
  s.resize(n);
  return s;
}

// calculate the GC content
double gcContent(string const &s) {
size_t gc = 0;
Expand Down Expand Up @@ -171,15 +160,3 @@ bool with_mmap(char const *file, function<bool(MMapReader&)> lambda) {
return ret;
}

// fprintnf: write at most n leading characters of str to fp;
// when str is longer than that, append "..." to mark the truncation
void fprintnf(FILE *fp, char const *str, int n) {
  int const len = static_cast<int>(strlen(str));
  int const shown = (len > n) ? n : len;

  int pos = 0;
  while (pos < shown) {
    fputc(str[pos], fp);
    ++pos;
  }

  bool const truncated = shown < len;
  if (truncated)
    fputs("...", fp);
}

2 changes: 0 additions & 2 deletions src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,10 @@

std::string randSeq(size_t n, std::string alphabet = "ACGT");
std::string randSeq(size_t n, double gc);
std::string randRun(size_t n, size_t l, std::string alphabet = "ACGT");
double gcContent(std::string const &s);
std::string revComp(std::string const &s);

std::string base_name(std::string const & path);
void fprintnf(FILE *fp, char const *str, int n);
int open_or_fail(char const *fname, int flag);
FILE *fopen_or_fail(char const *fname, char const *flags);
bool with_file(char const *file, std::function<bool(std::istream&)> lambda, std::ios_base::openmode mode=std::ios_base::in);
Expand Down
5 changes: 2 additions & 3 deletions tests/matchlength_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@ void checkML(string seq, string facts[], size_t num) {
computeMLFact(mlf, esa);
mu_assert(mlf.fact.size() == num, "wrong number of ML factors");

for (size_t i = 0; i < mlf.fact.size(); i++) {
mu_assert(!strncmp(mlf.str + mlf.fact[i], facts[i].c_str(), factLen(mlf, i)),
for (size_t i = 0; i < mlf.fact.size(); i++)
mu_assert(!strncmp(mlf.str + mlf.fact[i], facts[i].c_str(), mlf.factLen(i)),
"wrong factor");
}
}

void test_MatchLength1() { return checkML(seq1, factors1, 8); }
Expand Down
16 changes: 0 additions & 16 deletions tests/util_tests.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -30,21 +30,6 @@ void test_randSeq() {
mu_assert(alphabet.find(c) != string::npos, "invalid character: " << c);
}

void test_randRun() {
size_t n = 1000;
for (auto l : vector<size_t>{1, 2, 4, 8, 16}) {
string runseq = randRun(n, l);
string per = runseq.substr(0, l);
size_t cnt = 0;
size_t pos = 0;
while (runseq.find(per, pos) != string::npos) {
cnt++;
pos += l;
}
mu_assert_eq(n / l, cnt, "wrong number of repeats");
}
}

// test reverse complement. assumes normalized sequence, so just ACGT toggle
// and whole string is reversed.
void test_revComp() {
Expand All @@ -55,7 +40,6 @@ void test_revComp() {

void all_tests() {
mu_run_test(test_randSeq);
mu_run_test(test_randRun);
mu_run_test(test_revComp);
}
RUN_TESTS(all_tests)

0 comments on commit 2f9a858

Please sign in to comment.