Skip to content

Commit

Permalink
Merge pull request #30 from Illumina/release-2.5.4
Browse files Browse the repository at this point in the history
ExpansionHunter v2.5.4
  • Loading branch information
egor-dolzhenko authored Feb 2, 2018
2 parents 93a8fb8 + 745b294 commit a652f8e
Show file tree
Hide file tree
Showing 8 changed files with 68 additions and 49 deletions.
1 change: 1 addition & 0 deletions .clang-format
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
BasedOnStyle: Google
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,4 @@
cmake-build-debug
test-example
build
.vscode
19 changes: 19 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
language: cpp

addons:
apt:
sources:
- boost-latest
- ubuntu-toolchain-r-test
packages:
- g++-4.9
- libboost1.55-all-dev

script:
- export CC="gcc-4.9"
- export CXX="g++-4.9"
- mkdir build
- cd build
- cmake ..
- make -j4
- make test
8 changes: 4 additions & 4 deletions common/parameters.cc
Original file line number Diff line number Diff line change
Expand Up @@ -36,12 +36,12 @@
#include <boost/lexical_cast.hpp>
#include <boost/program_options.hpp>

using boost::lexical_cast;
using boost::format;
using boost::assign::list_of;
using std::string;
using boost::format;
using boost::lexical_cast;
using std::cerr;
using std::endl;
using std::string;
using std::vector;

namespace po = boost::program_options;
Expand Down Expand Up @@ -119,7 +119,7 @@ bool Parameters::Load(int argc, char **argv) {

if (argc == 1) {
std::cerr << usage << std::endl;
return false;
throw std::invalid_argument("");
}

po::variables_map arg_map;
Expand Down
16 changes: 10 additions & 6 deletions common/parameters.h
Original file line number Diff line number Diff line change
Expand Up @@ -31,25 +31,29 @@
#include "genotyping/short_repeat_genotyper.h"

class Outputs {
public:
public:
Outputs(const std::string vcf_path, const std::string json_path,
const std::string log_path);
std::ostream &vcf() { return vcf_; }
std::ostream &json() { return json_; }
std::ostream &log() { return log_; }

private:
private:
std::ofstream vcf_;
std::ofstream json_;
std::ofstream log_;
};

class Parameters {
public:
public:
const double kSmallestPossibleDepth = 5.0;
Parameters()
: region_extension_len_(1000), min_wp_(0.90), min_baseq_(20),
min_anchor_mapq_(60), skip_unaligned_(false), depth_(0.0),
: region_extension_len_(1000),
min_wp_(0.90),
min_baseq_(20),
min_anchor_mapq_(60),
skip_unaligned_(false),
depth_(0.0),
sex_(Sex::kFemale) {}
bool Load(int argc, char **argv);
std::string bam_path() const { return bam_path_; }
Expand All @@ -73,7 +77,7 @@ class Parameters {
bool depth_is_set() const { return depth_ >= kSmallestPossibleDepth; }
Sex sex() const { return sex_; }

private:
private:
std::string bam_path_;
std::string genome_path_;
// Maximum distance from a region to search for relevant reads.
Expand Down
2 changes: 1 addition & 1 deletion include/version.h
Original file line number Diff line number Diff line change
Expand Up @@ -24,4 +24,4 @@

#include <string>

const std::string kProgramVersion = "Expansion Hunter v2.5.3";
const std::string kProgramVersion = "Expansion Hunter v2.5.4";
40 changes: 18 additions & 22 deletions src/expansion_hunter.cc
Original file line number Diff line number Diff line change
Expand Up @@ -51,14 +51,14 @@
#include "purity/purity.h"
#include "rep_align/rep_align.h"

using std::unordered_set;
using std::map;
using std::vector;
using std::string;
using std::array;
using std::cerr;
using std::endl;
using std::map;
using std::pair;
using std::array;
using std::string;
using std::unordered_set;
using std::vector;

// Returns the length of the first read in a BAM file.
size_t CalcReadLen(const string &bam_path) {
Expand Down Expand Up @@ -189,14 +189,10 @@ void CacheAligns(BamFile *bam_file, const RepeatSpec &repeat_spec,
&align_pairs);
}

// Filling-in missing mates by jumping around the BAM.
// Filling-in missing mates by jumping around the BAM/CRAM file.
cerr << TimeStamp() << ",\t[Filling in mates]" << endl;
if ((*bam_file).format() == BamFile::kBamFile) {
FillinMates(*bam_file, align_pairs, repeat_spec.units_shifts, 0.9,
ontarget_frag_names);
} else {
cerr << TimeStamp() << ",\t[Skipping filling in mates]" << endl;
}
FillinMates(*bam_file, align_pairs, repeat_spec.units_shifts, 0.9,
ontarget_frag_names);
cerr << TimeStamp() << ",\t[Done filling in mates]" << endl;
}

Expand Down Expand Up @@ -385,22 +381,19 @@ void EstimateRepeatSizes(const Parameters &parameters,
haplotype_candidates.push_back(
RepeatAllele(read_group.size, read_group.num_supporting_reads,
ReadType::kSpanning));

} else if (read_group.read_type == ReadType::kInrepeat) {
flanking_size_counts[num_units_in_read] +=
read_group.num_supporting_reads;
haplotype_candidates.push_back(
RepeatAllele(num_units_in_read, read_group.num_supporting_reads,
ReadType::kInrepeat));

} else if (read_group.read_type == ReadType::kFlanking) {
haplotype_candidates.push_back(
RepeatAllele(read_group.size, read_group.num_supporting_reads,
ReadType::kFlanking));
for (const auto &align : read_group.rep_aligns) {
flanking_size_counts[align.size] += 1;
}

} else {
throw std::logic_error("Do not know how to deal with " +
kReadTypeToString.at(read_group.read_type) +
Expand All @@ -427,9 +420,10 @@ void EstimateRepeatSizes(const Parameters &parameters,
cerr << "]" << endl;

if (haplotype_candidates.empty()) {
cerr << TimeStamp()
<< ",\t[Skipping this region because no informative reads were found]"
<< endl;
cerr
<< TimeStamp()
<< ",\t[Skipping this region because no informative reads were found]"
<< endl;
continue;
}

Expand Down Expand Up @@ -468,12 +462,13 @@ int main(int argc, char *argv[]) {
cerr << kProgramVersion << endl;

if (!parameters.Load(argc, argv)) {
return 1;
return 0;
}

cerr << TimeStamp() << ",[Starting Logging for " << parameters.sample_name() << "]" << endl;
cerr << TimeStamp() << ",[Starting Logging for " << parameters.sample_name()
<< "]" << endl;

Outputs outputs(parameters.vcf_path(), parameters.json_path(),
Outputs outputs(parameters.vcf_path(), parameters.json_path(),
parameters.log_path());

map<string, RepeatSpec> repeat_specs;
Expand All @@ -498,7 +493,8 @@ int main(int argc, char *argv[]) {
parameters.set_depth(depth);
}

cerr << TimeStamp() << ",[Read length: " << parameters.read_len() << "]" << endl;
cerr << TimeStamp() << ",[Read length: " << parameters.read_len() << "]"
<< endl;
cerr << TimeStamp() << ",[Depth: " << parameters.depth() << "]" << endl;

if (parameters.depth() < parameters.kSmallestPossibleDepth) {
Expand Down
30 changes: 14 additions & 16 deletions src/vcf_output.cc
Original file line number Diff line number Diff line change
Expand Up @@ -31,18 +31,17 @@

#include <boost/algorithm/string/join.hpp>

using std::vector;
using std::ostream;
using std::string;
using std::map;
using std::cerr;
using std::endl;
using std::map;
using std::ostream;
using std::string;
using std::vector;

void WriteVcf(const Parameters &parameters,
const std::map<std::string, RepeatSpec> &repeat_specs,
const std::vector<RegionFindings> &sample_findings,
std::ostream &out) {

std::stringstream vcf_header, vcf_body;
// clang-format off
vcf_header <<
Expand All @@ -55,11 +54,11 @@ void WriteVcf(const Parameters &parameters,
"##INFO=<ID=REPID,Number=1,Type=String,Description=\"Repeat identifier from the repeat specification file\">\n"
"##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n"
"##FORMAT=<ID=SO,Number=1,Type=String,Description=\"Type of reads that support the allele; can be SPANNING, FLANKING, or INREPEAT meaning that the reads span, flank, or are fully contained in the repeat\">\n"
"##FORMAT=<ID=CN,Number=1,Type=String,Description=\"Allele copy number\">\n"
"##FORMAT=<ID=CI,Number=1,Type=String,Description=\"Confidence interval for CN\">\n"
"##FORMAT=<ID=AD_FL,Number=1,Type=String,Description=\"Number of flanking reads consistent with the allele\">\n"
"##FORMAT=<ID=AD_SP,Number=1,Type=String,Description=\"Number of spanning reads consistent with the allele\">\n"
"##FORMAT=<ID=AD_IR,Number=1,Type=String,Description=\"Number of in-repeat reads consistent with the allele\">\n";
"##FORMAT=<ID=REPCN,Number=1,Type=String,Description=\"Number of repeat units spanned by the allele\">\n"
"##FORMAT=<ID=REPCI,Number=1,Type=String,Description=\"Confidence interval for REPCN\">\n"
"##FORMAT=<ID=ADFL,Number=1,Type=String,Description=\"Number of flanking reads consistent with the allele\">\n"
"##FORMAT=<ID=ADSP,Number=1,Type=String,Description=\"Number of spanning reads consistent with the allele\">\n"
"##FORMAT=<ID=ADIR,Number=1,Type=String,Description=\"Number of in-repeat reads consistent with the allele\">\n";
// clang-format on

std::set<int> alt_sizes;
Expand All @@ -81,7 +80,6 @@ void WriteVcf(const Parameters &parameters,
sample_ad_fl, sample_ad_ir;

for (const RepeatAllele allele : region_findings.genotype) {

const int allele_len = allele.size_ * unit_len;
const string source = kReadTypeToString.at(allele.type_);

Expand Down Expand Up @@ -141,17 +139,17 @@ void WriteVcf(const Parameters &parameters,
}
}
const Region &region = repeat_spec.target_region;
const string info = "SVTYPE=STR;END=" + std::to_string(region.end()) +
";REF=" + std::to_string(reference_size) + ";RL=" +
std::to_string(reference_size * unit_len) + ";RU=" +
unit_encoding + ";REPID=" + region_id;
const string info = "END=" + std::to_string(region.end()) +
";REF=" + std::to_string(reference_size) +
";RL=" + std::to_string(reference_size * unit_len) +
";RU=" + unit_encoding + ";REPID=" + region_id;
if (alt.empty()) {
alt = ".";
}

vcf_body << region.chrom() << "\t" << region.start() - 1 << "\t.\t"
<< ref_field << "\t" << alt << "\t.\tPASS\t" << info
<< "\tGT:SO:CN:CI:AD_SP:AD_FL:AD_IR\t" << sample_gt << ":"
<< "\tGT:SO:REPCN:REPCI:ADSP:ADFL:ADIR\t" << sample_gt << ":"
<< sample_so << ":" << sample_cn << ":" << sample_ci << ":"
<< sample_ad_sp << ":" << sample_ad_fl << ":" << sample_ad_ir
<< endl;
Expand Down

0 comments on commit a652f8e

Please sign in to comment.