Skip to content

Commit

Permalink
Merge pull request #43 from Illumina/develop
Browse files Browse the repository at this point in the history
ExpansionHunter-v3.0.0-rc2
  • Loading branch information
egor-dolzhenko authored Jan 18, 2019
2 parents 082ce85 + ee299f6 commit d5fde05
Show file tree
Hide file tree
Showing 237 changed files with 2,199 additions and 2,095 deletions.
1 change: 0 additions & 1 deletion CMakeLists.txt
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,6 @@ find_package(Boost 1.4 REQUIRED COMPONENTS program_options filesystem regex date
include_directories(${CMAKE_CURRENT_SOURCE_DIR})
include_directories(SYSTEM ${Boost_INCLUDE_DIR})
include_directories(${CMAKE_BINARY_DIR}/thirdparty/htslib/include)
include_directories(thirdparty/graph-tools-GT-506/include)

add_subdirectory(thirdparty/graph-tools-master)

Expand Down
Empty file modified COPYRIGHT.txt
100755 → 100644
Empty file.
Empty file modified LICENSE.txt
100755 → 100644
Empty file.
Empty file modified README.md
100755 → 100644
Empty file.
26 changes: 26 additions & 0 deletions alignment/AlignmentFilters.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@

using graphtools::GraphAlignment;
using graphtools::NodeId;
using graphtools::Operation;
using graphtools::OperationType;
using graphtools::Path;
using std::list;
using std::string;
Expand Down Expand Up @@ -110,4 +112,28 @@ bool checkIfDownstreamAlignmentIsGood(NodeId nodeId, GraphAlignment alignment)
return score >= kScoreCutoff;
}

bool checkIfPassesAlignmentFilters(const GraphAlignment& alignment)
{
const Operation& firstOperation = alignment.alignments().front().operations().front();
const int frontSoftclipLen = firstOperation.type() == OperationType::kSoftclip ? firstOperation.queryLength() : 0;

const Operation& lastOperation = alignment.alignments().back().operations().back();
const int backSoftclipLen = lastOperation.type() == OperationType::kSoftclip ? lastOperation.queryLength() : 0;

const int clippedQueryLength = alignment.queryLength() - frontSoftclipLen - backSoftclipLen;
const int referenceLength = alignment.referenceLength();

const int percentQueryMatches = (100 * alignment.numMatches()) / clippedQueryLength;
const int percentReferenceMatches = (100 * alignment.numMatches()) / referenceLength;

if (percentQueryMatches >= 80 && percentReferenceMatches >= 80)
{
return true;
}
else
{
return false;
}
}

}
2 changes: 2 additions & 0 deletions alignment/AlignmentFilters.hh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -50,4 +50,6 @@ bool checkIfUpstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::Graph
// Checks if alignment downstream of a given node is high quality
bool checkIfDownstreamAlignmentIsGood(graphtools::NodeId nodeId, graphtools::GraphAlignment alignment);

bool checkIfPassesAlignmentFilters(const graphtools::GraphAlignment& alignment);

}
Empty file modified alignment/AlignmentTweakers.cpp
100755 → 100644
Empty file.
Empty file modified alignment/AlignmentTweakers.hh
100755 → 100644
Empty file.
Empty file modified alignment/CMakeLists.txt
100755 → 100644
Empty file.
Empty file modified alignment/GraphAlignmentOperations.cpp
100755 → 100644
Empty file.
Empty file modified alignment/GraphAlignmentOperations.hh
100755 → 100644
Empty file.
Empty file modified alignment/GreedyAlignmentIntersector.cpp
100755 → 100644
Empty file.
Empty file modified alignment/GreedyAlignmentIntersector.hh
100755 → 100644
Empty file.
Empty file modified alignment/HighQualityBaseRunFinder.cpp
100755 → 100644
Empty file.
Empty file modified alignment/HighQualityBaseRunFinder.hh
100755 → 100644
Empty file.
Empty file modified alignment/SoftclippingAligner.cpp
100755 → 100644
Empty file.
Empty file modified alignment/SoftclippingAligner.hh
100755 → 100644
Empty file.
Empty file modified alignment/tests/AlignmentTweakersTest.cpp
100755 → 100644
Empty file.
Empty file modified alignment/tests/CMakeLists.txt
100755 → 100644
Empty file.
Empty file modified alignment/tests/GraphAlignmentOperationsTest.cpp
100755 → 100644
Empty file.
Empty file modified alignment/tests/GreedyAlignmentIntersectorTest.cpp
100755 → 100644
Empty file.
Empty file modified alignment/tests/HighQualityBaseRunFinderTest.cpp
100755 → 100644
Empty file.
Empty file modified alignment/tests/SoftclippingAlignerTest.cpp
100755 → 100644
Empty file.
Empty file modified classification/AlignmentClassifier.cpp
100755 → 100644
Empty file.
Empty file modified classification/AlignmentClassifier.hh
100755 → 100644
Empty file.
Empty file modified classification/CMakeLists.txt
100755 → 100644
Empty file.
Empty file modified classification/ClassifierOfAlignmentsToVariant.cpp
100755 → 100644
Empty file.
Empty file modified classification/ClassifierOfAlignmentsToVariant.hh
100755 → 100644
Empty file.
Empty file modified classification/tests/AlignmentClassifierTest.cpp
100755 → 100644
Empty file.
1 change: 0 additions & 1 deletion classification/tests/AlignmentSummaryTest.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,6 @@

using namespace ehunter;

using reads::Read;
using std::map;
using std::vector;

Expand Down
Empty file modified classification/tests/CMakeLists.txt
100755 → 100644
Empty file.
Empty file modified classification/tests/ClassifierOfAlignmentsToVariantTest.cpp
100755 → 100644
Empty file.
Empty file modified cmake/google_test.cmake
100755 → 100644
Empty file.
Empty file modified common/CMakeLists.txt
100755 → 100644
Empty file.
Empty file modified common/Common.cpp
100755 → 100644
Empty file.
Empty file modified common/Common.hh
100755 → 100644
Empty file.
Empty file modified common/CountTable.cpp
100755 → 100644
Empty file.
Empty file modified common/CountTable.hh
100755 → 100644
Empty file.
99 changes: 48 additions & 51 deletions common/GenomicRegion.cpp
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -22,55 +22,34 @@

#include "common/GenomicRegion.hh"

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>
#include <boost/lexical_cast.hpp>

#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <boost/algorithm/string/classification.hpp>
#include <boost/algorithm/string/split.hpp>

using std::istream;
using std::ostream;
using std::string;
using std::unordered_map;
using std::vector;

using boost::lexical_cast;
using boost::algorithm::is_any_of;
using boost::algorithm::split;

namespace ehunter
{

Region::Region(const std::string chrom, int64_t start, int64_t end)
: chrom_(chrom)
GenomicRegion::GenomicRegion(int32_t contigIndex, int64_t start, int64_t end)
: contigIndex_(contigIndex)
, start_(start)
, end_(end)
{
}

Region::Region(const std::string encoding)
{
vector<string> components;
boost::algorithm::split(components, encoding, is_any_of(":-"));

if (components.size() != 3)
{
throw std::logic_error("Unexpected range format: " + encoding);
}

chrom_ = components[0];
start_ = lexical_cast<int64_t>(components[1]);
end_ = lexical_cast<int64_t>(components[2]);
}

bool Region::operator<(const Region& other) const
bool GenomicRegion::operator<(const GenomicRegion& other) const
{
if (chrom_ != other.chrom_)
if (contigIndex_ != other.contigIndex_)
{
return chrom_ < other.chrom_;
return contigIndex_ < other.contigIndex_;
}

if (start_ != other.start_)
Expand All @@ -81,9 +60,9 @@ bool Region::operator<(const Region& other) const
return end_ < other.end_;
}

bool Region::Overlaps(const Region& other) const
bool GenomicRegion::overlaps(const GenomicRegion& other) const
{
if (chrom_ != other.chrom_)
if (contigIndex_ != other.contigIndex_)
{
return false;
}
Expand All @@ -94,9 +73,9 @@ bool Region::Overlaps(const Region& other) const
return leftBound <= rightBound;
}

int64_t Region::Distance(const Region& other) const
int64_t GenomicRegion::distance(const GenomicRegion& other) const
{
if (chrom_ != other.chrom_)
if (contigIndex_ != other.contigIndex_)
{
return std::numeric_limits<int64_t>::max();
}
Expand All @@ -114,7 +93,7 @@ int64_t Region::Distance(const Region& other) const
return 0;
}

vector<Region> merge(vector<Region> regions, int maxMergeDist)
vector<GenomicRegion> merge(vector<GenomicRegion> regions, int maxMergeDist)
{
if (regions.empty())
{
Expand All @@ -123,12 +102,12 @@ vector<Region> merge(vector<Region> regions, int maxMergeDist)

std::sort(regions.begin(), regions.end());

Region mergedRegion = regions.front();
vector<Region> mergedRegions;
GenomicRegion mergedRegion = regions.front();
vector<GenomicRegion> mergedRegions;

for (const auto& currentRegion : regions)
{
if (currentRegion.Distance(mergedRegion) <= maxMergeDist)
if (currentRegion.distance(mergedRegion) <= maxMergeDist)
{
const int64_t furthestEnd = std::max<int64_t>(mergedRegion.end(), currentRegion.end());
mergedRegion.setEnd(furthestEnd);
Expand All @@ -148,27 +127,45 @@ vector<Region> merge(vector<Region> regions, int maxMergeDist)
return mergedRegions;
}

const string Region::ToString() const
{
std::ostringstream out;
out << *this;
return out.str();
}

// Returns the range extended by flankSize upstream and downstream.
// NOTE: The right boundary of the extended region may stick past chromosome
// end.
Region Region::extend(int length) const
GenomicRegion GenomicRegion::extend(int length) const
{
const int64_t new_start = start_ >= length ? (start_ - length) : 0;
const int64_t new_end = end_ + length;
return Region(chrom_, new_start, new_end);
const int64_t newStart = start_ >= length ? (start_ - length) : 0;
const int64_t newEnd = end_ + length;
return GenomicRegion(contigIndex_, newStart, newEnd);
}

std::ostream& operator<<(std::ostream& out, const Region& region)
std::ostream& operator<<(std::ostream& out, const GenomicRegion& region)
{
out << region.chrom_ << ":" << region.start_ << "-" << region.end_;
out << "(" << region.contigIndex_ << "):" << region.start_ << "-" << region.end_;
return out;
}

string encode(const ReferenceContigInfo& contigInfo, const GenomicRegion& region)
{
const auto& contigName = contigInfo.getContigName(region.contigIndex());
return contigName + ":" + std::to_string(region.start()) + "-" + std::to_string(region.end());
}

GenomicRegion decode(const ReferenceContigInfo& contigInfo, const string& encoding)
{
vector<string> components;
boost::algorithm::split(components, encoding, boost::algorithm::is_any_of(":-"));

if (components.size() != 3)
{
throw std::logic_error("Unexpected range format: " + encoding);
}

const auto& contigName = components[0];
int32_t contigIndex = contigInfo.getContigId(contigName);

int64_t start = std::stoi(components[1]);
int64_t end = std::stoi(components[2]);

return GenomicRegion(contigIndex, start, end);
}

}
45 changes: 27 additions & 18 deletions common/GenomicRegion.hh
100755 → 100644
Original file line number Diff line number Diff line change
Expand Up @@ -24,48 +24,57 @@

#include <iostream>
#include <string>
#include <unordered_map>
#include <vector>

#include "common/ReferenceContigInfo.hh"

namespace ehunter
{

class Region
// Represents a contiguous region of a genome using 0-based half-open coordinates
class GenomicRegion
{
public:
friend std::ostream& operator<<(std::ostream& out, const Region& region);
friend std::ostream& operator<<(std::ostream& out, const GenomicRegion& region);

Region(const std::string chrom, int64_t start, int64_t end);
explicit Region(const std::string encoding);
GenomicRegion(const int32_t contigIndex, int64_t start, int64_t end);

bool operator<(const Region& other) const;
bool operator<(const GenomicRegion& other) const;

bool Overlaps(const Region& other) const;
int64_t Distance(const Region& other) const;
bool overlaps(const GenomicRegion& other) const;
int64_t distance(const GenomicRegion& other) const;

const std::string& chrom() const { return chrom_; }
int32_t contigIndex() const { return contigIndex_; }
int64_t start() const { return start_; }
int64_t end() const { return end_; }
int64_t length() const { return end_ - start_ + 1; }
int64_t length() const { return end_ - start_; }

void setChrom(const std::string& chrom) { chrom_ = chrom; }
void setContigId(int32_t contigIndex) { contigIndex_ = contigIndex; }
void setStart(int64_t start) { start_ = start; }
void setEnd(int64_t end) { end_ = end; }
bool operator==(const Region& other) const

bool operator==(const GenomicRegion& other) const
{
return chrom_ == other.chrom_ && start_ == other.start_ && end_ == other.end_;
return contigIndex_ == other.contigIndex_ && start_ == other.start_ && end_ == other.end_;
}
bool operator!=(const Region& other) const { return !(*this == other); }

Region extend(int length) const;
const std::string ToString() const;
bool operator!=(const GenomicRegion& other) const { return !(*this == other); }

GenomicRegion extend(int length) const;

private:
std::string chrom_;
int32_t contigIndex_;
int64_t start_;
int64_t end_;
};

std::vector<Region> merge(std::vector<Region> regions, int maxMergeDist = 500);
std::ostream& operator<<(std::ostream& out, const Region& region);
using GenomicRegionCatalog = std::unordered_map<std::string, GenomicRegion>;

std::ostream& operator<<(std::ostream& out, const GenomicRegion& region);
std::vector<GenomicRegion> merge(std::vector<GenomicRegion> regions, int maxMergeDist = 500);

std::string encode(const ReferenceContigInfo& contigInfo, const GenomicRegion& region);
GenomicRegion decode(const ReferenceContigInfo& contigInfo, const std::string& encoding);

}
Loading

0 comments on commit d5fde05

Please sign in to comment.