Skip to content

Commit

Permalink
Refactored, allow region renaming
Browse files Browse the repository at this point in the history
  • Loading branch information
Anton Pirogov committed Oct 6, 2016
1 parent 8c82b53 commit 9f92a06
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 66 deletions.
39 changes: 36 additions & 3 deletions src/args.cpp
Original file line number Diff line number Diff line change
@@ -1,22 +1,25 @@
#include <iostream>
#include <algorithm>
#include <string>
using namespace std;

#include "args.h"
#include "index.h"
#include "util.h"
#include <getopt.h>

// globally accessible arguments for convenience
Args args;

static char const opts_short[] = "hw:k:isln:f:pgb";
static char const opts_short[] = "hw:k:islr:n:f:pgb";
static struct option const opts[] = {
{"help", no_argument, nullptr, 'h'},
{"window-size", required_argument, nullptr, 'w'},
{"window-interval", required_argument, nullptr, 'k'},
{"load-index", no_argument, nullptr, 'i'},
{"save-index", no_argument, nullptr, 's'},
{"list-index", no_argument, nullptr, 'l'},
{"rename-regions", required_argument, nullptr, 'r'},
{"seq", required_argument, nullptr, 'n'},
{"batchfile", required_argument, nullptr, 'f'},
{"print-factors", no_argument, nullptr, 'p'},
Expand All @@ -36,6 +39,7 @@ static char const usage[] = PROGNAME
"\t-i: use index file instead of FASTA sequence file\n"
"\t-s: output index file for further processing (no regular result)\n"
"\t-l: list sequences stored in index file\n"
"\t-r FILE: file that contains a list of regions to process\n"
"\t-n IDX:FROM-TO: calculate for given sequence and region within file\n"
"\t (defaults: IDX=0, FROM=0, TO=end of whole sequence. valid syntax: IDX | IDX:FROM-TO)\n"
"\t-f FILE: file that contains a list of regions to process\n"
Expand Down Expand Up @@ -87,6 +91,7 @@ Task::Task(string str) {
void Args::parse(int argc, char *argv[]) {
int c = 0; // getopt stores value returned (last struct component) here
int opt_idx = 0; // getopt stores the option index here.
vector<string> names;
while ((c = getopt_long(argc, argv, opts_short, opts, &opt_idx)) != -1) {
switch (c) {
case 0: // long option without a short name
Expand Down Expand Up @@ -124,7 +129,7 @@ void Args::parse(int argc, char *argv[]) {
cerr << "ERROR: -f incompatible with -n!" << endl;
exit(1);
}
if (!with_file(optarg, [&](istream &in){
if (!with_file_in(optarg, [&](istream &in){
string line;
int num=0;
while (in >> line) {
Expand All @@ -134,7 +139,35 @@ void Args::parse(int argc, char *argv[]) {
}
return true;
}))
exit(1);
exit(1);
break;
case 'r':
if (!with_file_in(optarg, [&](istream &in){
string line;
while (getline(in, line)) {
if (line.size()>MAX_LABEL_LEN) {
cerr << "ERROR: Name list contains too long names!" << endl;
exit(1);
}
if (line.find_first_of("\t\n ")!=string::npos) {
cerr << "ERROR: Name list contains names with whitespace!" << endl;
exit(1);
}
names.push_back(line);
}
return true;
})) {
exit(1);
}
args.newnames = names;
sort(names.begin(),names.end());
for (size_t j=0; j<names.size()-1; j++) {
if (names[i]==names[j+1]) {
cerr << "ERROR: new names are not unique!" << endl;
exit(1);
}
}

break;

case 'p':
Expand Down
1 change: 1 addition & 0 deletions src/args.h
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,7 @@ struct Args {
bool s = false; // output index
bool l = false; // list contents of index
std::vector<Task> tasks; // number of sequence/region (+ offsets) in index file to work on
std::vector<std::string> newnames; //new names for regions -> rename regions in index

bool p = false; // print match length decomposition?
bool g = false; // output for ./dnalc_plot.sh
Expand Down
13 changes: 11 additions & 2 deletions src/dnalc.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -71,7 +71,7 @@ void printResults(int64_t idx, vector<pair<size_t,size_t>> &regs, size_t w, size
bool check_unique_names(FastaFile &ff) {
vector<string> names;
for (auto &seq : ff.seqs)
names.push_back(seq.name.substr(0,32));
names.push_back(seq.name.substr(0,MAX_LABEL_LEN));
sort(names.begin(),names.end());
for (size_t i=0; i<names.size()-1; i++)
if (names[i]==names[i+1])
Expand Down Expand Up @@ -140,10 +140,19 @@ int main(int argc, char *argv[]) {
args.parse(argc, argv);
cout << fixed << setprecision(4);

if (args.newnames.size()>0) {
if (args.num_files==0) {
cerr << "ERROR: No index file provided!" << endl;
return EXIT_FAILURE;
}
renameRegions(args.files[0], args.newnames);
return EXIT_SUCCESS;
}

tick();
if (args.num_files == 0)
processFile(nullptr); //from stdin
else
else
processFile(args.files[0]);
tock("total time");
}
122 changes: 80 additions & 42 deletions src/index.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -27,50 +27,45 @@ template<typename T> void binread(MMapReader &i, T &x) {
i.off += sizeof(x);
}

// serialize relevant data of a sequence
void serialize(ostream &o, ComplexityData const &cd) {
assert(cd.regions.size() == cd.labels.size());
const string magicstr = "BINIDX";

binwrite(o, (size_t)cd.name.size());
for (auto c : cd.name)
binwrite(o, c);
binwrite(o, cd.len);
binwrite(o, cd.gc);
bool saveData(ComplexityData &cd, char const *file) {
assert(cd.regions.size() == cd.labels.size());
return with_file_out(file, [&](ostream &o) {
for (auto c : magicstr) //magic sequence
binwrite(o, c);

binwrite(o, (size_t)cd.regions.size());
for (auto i : cd.regions){
binwrite(o, i.first);
binwrite(o, i.second);
}
for (auto &l : cd.labels){
binwrite(o, l.size());
for (char c : l)
binwrite(o, (size_t)cd.name.size());
for (auto c : cd.name)
binwrite(o, c);
}
binwrite(o, cd.len);
binwrite(o, cd.gc);

binwrite(o, cd.numbad);
binwrite(o, (size_t)cd.bad.size());
for (auto i : cd.bad){
binwrite(o, i.first);
binwrite(o, i.second);
}
binwrite(o, (size_t)cd.regions.size());
for (auto i : cd.regions){
binwrite(o, i.first);
binwrite(o, i.second);
}
for (auto &l : cd.labels) {
binwrite(o, l.size());
for (size_t i=0; i<MAX_LABEL_LEN; i++)
binwrite(o, i < l.size() ? l[i] : (char)0);
}

binwrite(o, (size_t)cd.fstRegionFact.size());
for (auto i : cd.fstRegionFact)
binwrite(o, i);
binwrite(o, (size_t)cd.mlf.size());
for (auto i : cd.mlf)
binwrite(o, i);
}
binwrite(o, cd.numbad);
binwrite(o, (size_t)cd.bad.size());
for (auto i : cd.bad){
binwrite(o, i.first);
binwrite(o, i.second);
}

const string magicstr = "BINIDX";
binwrite(o, (size_t)cd.fstRegionFact.size());
for (auto i : cd.fstRegionFact)
binwrite(o, i);
binwrite(o, (size_t)cd.mlf.size());
for (auto i : cd.mlf)
binwrite(o, i);

// serialize a series of sequences
bool saveData(ComplexityData &dat, char const *file) {
return with_file_out(file, [&](ostream &o) {
for (auto c : magicstr) //magic sequence
binwrite(o, c);
serialize(o, dat);
return true;
});
}
Expand All @@ -82,8 +77,8 @@ bool loadData(ComplexityData &dat, char const *file, bool onlyInfo) {
<< " Please pass it as argument!" << endl;
return false;
}
// return with_file(file, [&](istream &fin) {
return with_mmap(file, [&](MMapReader &fin) {
return with_file_in(file, [&](istream &fin) {
// return with_mmap(file, [&](MMapReader &fin) {
char tmp;
for (size_t i = 0; i<magicstr.size(); i++) {
binread(fin, tmp);
Expand Down Expand Up @@ -115,9 +110,10 @@ bool loadData(ComplexityData &dat, char const *file, bool onlyInfo) {
for (size_t j = 0; j < rnum; j++) {
size_t lbllen;
binread(fin,lbllen);
for (size_t k=0; k<lbllen; k++) {
for (size_t k=0; k<MAX_LABEL_LEN; k++) {
binread(fin,tmp);
dat.labels[j] += tmp;
if (k < lbllen)
dat.labels[j] += tmp;
}
}

Expand Down Expand Up @@ -159,7 +155,49 @@ bool loadData(ComplexityData &dat, char const *file, bool onlyInfo) {
}

return true;
}); //, ios::binary);
}, ios::in|ios::binary);
}

bool renameRegions(char const *file, vector<string> const &names) {
if (!file) {
cerr << "ERROR: Can not load binary index file from pipe!"
<< " Please pass it as argument!" << endl;
return false;
}
return with_file(file, [&](fstream &fs) {
char tmp;
for (size_t i = 0; i<magicstr.size(); i++) {
binread(fs, tmp);
if (tmp != magicstr[i]) {
cerr << "ERROR: This does not look like an index file!" << endl;
return false;
}
}
size_t tmpsz;
binread(fs,tmpsz);
for (size_t j=0; j<tmpsz; j++)
binread(fs,tmp);
double gc;
binread(fs,tmpsz);
binread(fs,gc);

size_t rnum;
binread(fs,rnum);
if (rnum != names.size()) {
cerr << "ERROR: number of given names and regions does not match!" << endl;
return false;
}
for (size_t j = 0; j < rnum; j++) {
binread(fs,tmpsz);
binread(fs,tmpsz);
}
for (auto &s : names) {
binwrite(fs, s.size());
for (size_t i=0; i<MAX_LABEL_LEN; i++)
binwrite(fs, i < s.size() ? s[i] : (char)0);
}
return true;
}, fstream::in|fstream::out|fstream::binary);
}

// given sequences from a fasta file, calculate match factors and runs
Expand Down
3 changes: 3 additions & 0 deletions src/index.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@ struct ComplexityData {
std::vector<size_t> mlf; // match factors
};

const size_t MAX_LABEL_LEN = 32;

bool loadData(ComplexityData &cplx, char const *file, bool onlyInfo=false);
bool saveData(ComplexityData &cplx, char const *file);
bool renameRegions(char const *file, vector<string> const &names);

void extractData(ComplexityData &cplx, FastaFile &file);
24 changes: 15 additions & 9 deletions src/util.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
#include <random>
#include <chrono>
#include <cstring>
#include <functional>
#include <iostream>
#include <fstream>

Expand Down Expand Up @@ -111,29 +110,36 @@ FILE *fopen_or_fail(char const *fname, char const *flags) {

// if file==nullptr, calls lambda with stdin as stream, otherwise opens file, auto-closes
template<typename T>
bool with_file(char const *file, function<bool(T&)> lambda, ios_base::openmode mode, T& def) {
T *streamP = &def;
fstream fs;
bool with_file(char const *file, std::function<bool(T&)> lambda, std::ios_base::openmode mode, T* def=nullptr) {
T *streamP = def;
std::fstream fs;
if (file) {
// fs = ifstream(file); //does not work with older g++?
fs.open(file, mode);
if (!fs.is_open()) {
cerr << "ERROR: Could not open file: " << file << endl;
std::cerr << "ERROR: Could not open file: " << file << std::endl;
return false;
}
streamP = &fs;
}
if (streamP == nullptr) {
std::cerr << "ERROR: Invalid default file stream!" << std::endl;
return false;
}
T &stream = *streamP;
bool ret = lambda(stream);
if (fs.is_open())
fs.close();
return ret;
}
bool with_file(char const *file, function<bool(istream&)> lambda, ios_base::openmode mode) {
return with_file(file, lambda, mode, cin);

bool with_file(char const *file, function<bool(fstream&)> lambda, ios_base::openmode mode) {
return with_file(file, lambda, mode, static_cast<fstream*>(nullptr));
}
bool with_file_in(char const *file, function<bool(istream&)> lambda, ios_base::openmode mode) {
return with_file(file, lambda, ios_base::in|mode, &cin);
}
bool with_file_out(char const *file, function<bool(ostream&)> lambda, ios_base::openmode mode) {
return with_file(file, lambda, mode, cout);
return with_file(file, lambda, ios_base::out|mode, &cout);
}

size_t getFilesize(const char* filename) {
Expand Down
4 changes: 3 additions & 1 deletion src/util.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,10 @@ std::string revComp(std::string const &s);
std::string base_name(std::string const & path);
int open_or_fail(char const *fname, int flag);
FILE *fopen_or_fail(char const *fname, char const *flags);
bool with_file(char const *file, std::function<bool(std::istream&)> lambda, std::ios_base::openmode mode=std::ios_base::in);

//overloading does not work here with old g++
bool with_file(char const *file, std::function<bool(std::fstream&)> lambda, std::ios_base::openmode mode);
bool with_file_in(char const *file, std::function<bool(std::istream&)> lambda, std::ios_base::openmode mode=std::ios_base::in);
bool with_file_out(char const *file, std::function<bool(std::ostream&)> lambda, std::ios_base::openmode mode=std::ios_base::out);

struct MMapReader {
Expand Down
Loading

0 comments on commit 9f92a06

Please sign in to comment.