Skip to content

Commit

Permalink
output format
Browse files Browse the repository at this point in the history
  • Loading branch information
Chao Jiannan committed Jul 17, 2021
1 parent 968ffb4 commit 0f90589
Show file tree
Hide file tree
Showing 15 changed files with 207 additions and 79 deletions.
25 changes: 18 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,32 +4,43 @@ This project is aimed at improving a part of [HAlign](https://github.com/malabz/

## Usage

The command below prints the usage on Windows:

```
stmsa-x.x.x-win-x64.exe unaligned.fasta output.fasta
stmsa-x.x.x-win-x64.exe --help
```

## Change Log

* 2021-07-15
2021-07-17

improve the output format

2021-07-15

further decrease the space requirement

* 2021-04-25
2021-04-25

self-defined allocator for the suffix tree, which reduces the construction time to about 1/3 compared with ::operator new

* 2021-03-29
2021-03-29

released as a crude but usable msa tool

* 2021-03-11
2021-03-11

generalised suffix tree and Needleman-Wunsch algorithms implemented

## Build

- msvc
msvc

clang

## Dependencies

- clang
Boost

## License

Expand Down
39 changes: 20 additions & 19 deletions src/PairwiseAlignment/NeedlemanWunsh.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -36,32 +36,33 @@ namespace pairwise_alignment
NeedlemanWunsh(RandomAccessIterator1 lhs_first, RandomAccessIterator1 lhs_last,
RandomAccessIterator2 rhs_first, RandomAccessIterator2 rhs_last,
unsigned flag, const ScoringMatrixType& scoring_matrix,
int gap_open, int gap_extention):
int gap_open, int gap_extention)

_lhs_first(lhs_first), _lhs_last(lhs_last),
_rhs_first(rhs_first), _rhs_last(rhs_last),
: _lhs_first(lhs_first), _lhs_last(lhs_last)
, _rhs_first(rhs_first), _rhs_last(rhs_last)

_lhs_len(std::distance(lhs_first, lhs_last)),
_rhs_len(std::distance(rhs_first, rhs_last)),
, _lhs_len(std::distance(lhs_first, lhs_last))
, _rhs_len(std::distance(rhs_first, rhs_last))

_scoring_matrix(scoring_matrix),
_l_ending(flag & LEFT_ENDING),
_r_ending(flag & RIGHT_ENDING),
, _scoring_matrix(scoring_matrix)
, _l_ending(flag & LEFT_ENDING)
, _r_ending(flag & RIGHT_ENDING)

_op(gap_open),
_l_op(_l_ending ? 0 : gap_open),
_r_op(_r_ending ? 0 : gap_open),
, _op(gap_open)
, _l_op(_l_ending ? 0 : gap_open)
, _r_op(_r_ending ? 0 : gap_open)

_ex(gap_extention),
_l_ex(_l_ending ? 0 : gap_extention),
_r_ex(_r_ending ? 0 : gap_extention),
, _ex(gap_extention)
, _l_ex(_l_ending ? 0 : gap_extention)
, _r_ex(_r_ending ? 0 : gap_extention)

_dp_matrix(NUM, d2im(_lhs_len + 1, d1im(_rhs_len + 1, 0))),
_pa_matrix(NUM, d2pm(_lhs_len + 1, d1pm(_rhs_len + 1))),
, _dp_matrix(NUM, d2im(_lhs_len + 1, d1im(_rhs_len + 1, 0)))
, _pa_matrix(NUM, d2pm(_lhs_len + 1, d1pm(_rhs_len + 1)))

_lhs_gaps(_lhs_len + 1, 0),
_rhs_gaps(_rhs_len + 1, 0)
{}
, _lhs_gaps(_lhs_len + 1, 0)
, _rhs_gaps(_rhs_len + 1, 0)
{
}

std::tuple<gap_vector_type, gap_vector_type> _align()
{
Expand Down
10 changes: 6 additions & 4 deletions src/PairwiseAlignment/NeedlemanWunshReusable.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,10 +15,12 @@ namespace pairwise_alignment
public:
using gap_vector_type = std::vector<size_t>;

NeedlemanWunshReusable(const ScoringMatrixType& scoring_matrix, int gap_open, int gap_extention):
_scoring_matrix(scoring_matrix),
_op(gap_open), _ex(gap_extention),
_lhs_capacity(0), _rhs_capacity(0)
NeedlemanWunshReusable(const ScoringMatrixType& scoring_matrix, int gap_open, int gap_extention)
: _scoring_matrix(scoring_matrix)
, _op(gap_open)
, _ex(gap_extention)
, _lhs_capacity(0)
, _rhs_capacity(0)
{}

~NeedlemanWunshReusable()
Expand Down
12 changes: 6 additions & 6 deletions src/StarAlignment/StarAligner.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@ auto star_alignment::StarAligner::get_gaps(const std::vector<sequence_type> &seq
return StarAligner(sequences)._get_gaps();
}

star_alignment::StarAligner::StarAligner(const std::vector<sequence_type> &sequences):
_sequences(sequences),
_row(_sequences.size()),
_lengths(_set_lengths()),
_centre(_set_centre()),
_centre_len(_centre.size())
star_alignment::StarAligner::StarAligner(const std::vector<sequence_type> &sequences)
: _sequences(sequences)
, _row(_sequences.size())
, _lengths(_set_lengths())
, _centre(_set_centre())
, _centre_len(_centre.size())
{}

std::vector<size_t> star_alignment::StarAligner::_set_lengths() const
Expand Down
2 changes: 2 additions & 0 deletions src/SuffixTree/LeftChildRightSiblingBinaryTree.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#pragma once

#include "SuffixTree.hpp"
#include "../Utils/NucleicAcidColumn.hpp"

Expand Down
6 changes: 6 additions & 0 deletions src/Utils/Arguments.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#include "Arguments.hpp"

// Definitions of the globals declared `extern` in Arguments.hpp.
// Both strings are default-constructed, so they start out empty.
std::string arguments::in_file_name;
std::string arguments::out_file_name;

// No initializer is given: as a namespace-scope object this is zero-initialized,
// i.e. it starts out as false.
bool arguments::output_matrix;
13 changes: 13 additions & 0 deletions src/Utils/Arguments.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
#pragma once

#include <string>

// Program-wide settings exposed as plain global variables.
// This header only declares them; the definitions live in Arguments.cpp.
namespace arguments
{

// presumably the path of the input FASTA file — confirm against the code that assigns it
extern std::string in_file_name;
// presumably the path the result is written to — confirm against the code that assigns it
extern std::string out_file_name;

// NOTE(review): the name suggests a switch selecting a "matrix" style of output — verify at the point of use
extern bool output_matrix;

}
33 changes: 30 additions & 3 deletions src/Utils/Fasta.cpp
Original file line number Diff line number Diff line change
@@ -1,16 +1,21 @@
#include "Fasta.hpp"

utils::Fasta::Fasta(std::istream& is) { _read(is); }
#include <cstring>

void utils::Fasta::write_to(std::ostream& os, bool with_identification) const
utils::Fasta::Fasta(std::istream &is)
{
_read(is);
}

void utils::Fasta::write_to(std::ostream &os, bool with_identification) const
{
if (with_identification)
write_to(os, sequences.cbegin(), sequences.cend(), identifications.cbegin());
else
write_to(os, sequences.cbegin(), sequences.cend());
}

void utils::Fasta::_read(std::istream& is)
void utils::Fasta::_read(std::istream &is)
{
std::string each_line, each_sequence;

Expand Down Expand Up @@ -38,3 +43,25 @@ void utils::Fasta::_read(std::istream& is)

sequences.push_back(each_sequence);
}

// Writes `sequence` to `os`, wrapped so that no output line is longer than
// max_line_length characters.
//
// A '\n' is inserted between consecutive chunks only: nothing is emitted before
// the first chunk or after the last one, and an empty sequence produces an
// empty insertion.
//
// The wrapped text is assembled in a std::string and inserted with a single
// `os << ...`. Owning the buffer through std::string (instead of new[]/delete[])
// means it is released even if the stream insertion throws, and inserting a
// std::string writes every character, whereas inserting a char* would stop at
// the first embedded NUL.
//
// os:       destination stream
// sequence: the sequence text to wrap and write
void utils::Fasta::cut_and_write(std::ostream &os, const std::string &sequence)
{
    const size_t sequence_length = sequence.size();

    std::string cut_sequence;
    // original characters plus at most one line break per full line
    cut_sequence.reserve(sequence_length + sequence_length / max_line_length);

    for (size_t src_index = 0; src_index < sequence_length; src_index += max_line_length)
    {
        // separate chunks with a line break; the first chunk gets none
        if (src_index) cut_sequence.push_back('\n');

        // append(str, pos, n) clamps n to the characters remaining after pos,
        // so the final, possibly shorter, chunk needs no special handling
        cut_sequence.append(sequence, src_index, max_line_length);
    }

    os << cut_sequence;
}
31 changes: 18 additions & 13 deletions src/Utils/Fasta.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,50 @@ namespace utils
class Fasta
{
private:
void _read(std::istream& is);
void _read(std::istream &is);

public:
static constexpr unsigned max_line_length = 80;

std::vector<std::string> sequences;
std::vector<std::string> identifications;

explicit Fasta(std::istream& is);
explicit Fasta(std::istream &is);

void write_to(std::ostream &os, bool with_idification = true) const;

void write_to(std::ostream& os, bool with_idification = true) const;
static void cut_and_write(std::ostream &os, const std::string &sequence);

template<typename InputIterator>
static void write_to(std::ostream& os, InputIterator sequence_first, InputIterator sequence_last)
static void write_to(std::ostream &os, InputIterator sequence_first, InputIterator sequence_last)
{
if (sequence_first == sequence_last) return;

using difference_type = decltype(std::distance(sequence_first, sequence_last));
const difference_type len = std::distance(sequence_first, sequence_last);

for (difference_type i = 0; i != len - 1; ++sequence_first, ++i)
os << *sequence_first << '\n';
os << *sequence_first;
for (difference_type i = 0; i != len; ++sequence_first, ++i)
{
os << *sequence_first;
if (i != len - 1) os << '\n';
}
}

template<typename InputIterator1, typename InputIterator2>
static void write_to(std::ostream& os, InputIterator1 sequence_first, InputIterator1 sequence_last,
InputIterator2 identification_first)
static void write_to(std::ostream &os, InputIterator1 sequence_first, InputIterator1 sequence_last,
InputIterator2 identification_first)
{
if (sequence_first == sequence_last) return;

using difference_type = decltype(std::distance(sequence_first, sequence_last));
const difference_type len = std::distance(sequence_first, sequence_last);

for (difference_type i = 0; i != len - 1; ++sequence_first, ++identification_first, ++i)
for (difference_type i = 0; i != len; ++sequence_first, ++identification_first, ++i)
{
os << '>' << *identification_first << '\n';
os << *sequence_first << '\n';
cut_and_write(os, *sequence_first);
if (i != len - 1) os << '\n';
}
os << '>' << *identification_first << '\n';
os << *sequence_first;
}

};
Expand Down
10 changes: 6 additions & 4 deletions src/Utils/Graph.cpp
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
#include "Graph.hpp"

#include <stdexcept>

utils::AdjacencyList::AdjacencyList(size_t nodes_number):
nodes(nodes_number),
_node_num(nodes_number)
utils::AdjacencyList::AdjacencyList(size_t nodes_number)
: nodes(nodes_number)
, _node_num(nodes_number)
// , _edge_num(0)
{}

void utils::AdjacencyList::add_edge(size_t from, size_t to, unsigned weight)
{
nodes[from].emplace_back(to, weight);
++_edge_num;
// ++_edge_num;
}

std::vector<size_t> utils::AdjacencyList::topological_sort() const
Expand Down
17 changes: 9 additions & 8 deletions src/Utils/Graph.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#pragma once

#include <vector>

namespace utils
Expand All @@ -12,9 +14,9 @@ namespace utils
size_t to;
unsigned weight;

edge_type(size_t to, unsigned weight):
to(to),
weight(weight)
edge_type(size_t to, unsigned weight)
: to(to)
, weight(weight)
{}
};

Expand All @@ -24,9 +26,9 @@ namespace utils
size_t from;
unsigned weight;

reverse_edge_type(size_t from, unsigned weight):
from(from),
weight(weight)
reverse_edge_type(size_t from, unsigned weight)
: from(from)
, weight(weight)
{}
};

Expand All @@ -44,9 +46,8 @@ namespace utils
std::vector<size_t> topological_sort() const;

private:

size_t _edge_num;
size_t _node_num;
// size_t _edge_num;
};

// assume that nodes with higher index cannot have an edge linking to a node with lower index
Expand Down
2 changes: 2 additions & 0 deletions src/Utils/Insertion.hpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
#pragma once

#include <cstddef>
#include <iterator>

Expand Down
Loading

0 comments on commit 0f90589

Please sign in to comment.