Skip to content

Commit

Permalink
Merge branch 'master' of github.com:pangenome/odgi
Browse files Browse the repository at this point in the history
  • Loading branch information
subwaystation committed Aug 28, 2023
2 parents f700063 + c4cc692 commit 3cf6705
Show file tree
Hide file tree
Showing 3 changed files with 193 additions and 20 deletions.
7 changes: 4 additions & 3 deletions src/algorithms/subgraph/extract.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,10 @@ namespace odgi {
}

path_handle_t create_subpath(graph_t &subgraph, const string &subpath_name, const bool is_circular) {
if (subgraph.has_path(subpath_name)) {
subgraph.destroy_path(subgraph.get_path_handle(subpath_name));
}
// The function assumes that every path is new and unique
// if (subgraph.has_path(subpath_name)) {
// subgraph.destroy_path(subgraph.get_path_handle(subpath_name));
// }
return subgraph.create_path_handle(subpath_name, is_circular);
};

Expand Down
54 changes: 38 additions & 16 deletions src/subcommand/extract_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -267,25 +267,46 @@ namespace odgi {

std::vector<odgi::path_range_t> input_path_ranges;

// handle targets from BED
if (_path_bed_file && !args::get(_path_bed_file).empty()) {
std::ifstream bed_in(args::get(_path_bed_file));
std::string line;
while (std::getline(bed_in, line)) {
add_bed_range(input_path_ranges, graph, line);
{
// handle targets from BED
if (_path_bed_file && !args::get(_path_bed_file).empty()) {
std::ifstream bed_in(args::get(_path_bed_file));
std::string line;
while (std::getline(bed_in, line)) {
add_bed_range(input_path_ranges, graph, line);
}
}
}

// handle targets from command line
if (_path_range) {
Region region;
parse_region(args::get(_path_range), region);
// handle targets from command line
if (_path_range) {
Region region;
parse_region(args::get(_path_range), region);

// no coordinates given, we do whole thing (0,-1)
if (region.start < 0 || region.end < 0) {
add_bed_range(input_path_ranges, graph, region.seq);
} else {
add_bed_range(input_path_ranges, graph, region.seq + "\t" + std::to_string(region.start) + "\t" + std::to_string(region.end));
// no coordinates given, we do whole thing (0,-1)
if (region.start < 0 || region.end < 0) {
add_bed_range(input_path_ranges, graph, region.seq);
} else {
add_bed_range(input_path_ranges, graph, region.seq + "\t" + std::to_string(region.start) + "\t" + std::to_string(region.end));
}
}

// Check duplicates
std::vector<path_range_t> copy_ranges = input_path_ranges; // Create a copy of the vector to avoid sorting the original one

auto compare_path_range = [](const path_range_t& a, const path_range_t& b) -> bool {
if (a.begin.path != b.begin.path) return a.begin.path < b.begin.path;
if (a.begin.offset != b.begin.offset) return a.begin.offset < b.begin.offset;
if (a.end.path != b.end.path) return a.end.path < b.end.path;
return a.end.offset < b.end.offset;
}; // Lambda function to compare two path_range_t objects

std::sort(copy_ranges.begin(), copy_ranges.end(), compare_path_range); // Sort the copied vector using the lambda function

for (size_t i = 1; i < copy_ranges.size(); i++) {
if (!compare_path_range(copy_ranges[i-1], copy_ranges[i])) {
std::cerr << "[odgi::extract] error: " << graph.get_path_name(copy_ranges[i].begin.path) << ":" << copy_ranges[i].begin.offset << "-" << copy_ranges[i].end.offset << " is a duplicated path range" << std::endl;
return 1;
}
}
}

Expand Down Expand Up @@ -610,6 +631,7 @@ namespace odgi {
const std::string path_name = source.get_path_name(path_range.begin.path);

subpaths_from_path_ranges.push_back(
// The function assumes that every path is new and unique
odgi::algorithms::create_subpath(
subgraph,
odgi::algorithms::make_path_name(path_name, path_range.begin.offset, path_range.end.offset),
Expand Down
152 changes: 151 additions & 1 deletion src/subcommand/paths_main.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,11 @@ int main_paths(int argc, char** argv) {
{'D', "delim"});
args::ValueFlag<std::uint16_t> path_delim_pos(path_investigation_opts, "N", "Consider the N-th occurrence of the delimiter specified with **-D, --delim**"
" to obtain the group identifier. Specify 1 for the 1st occurrence (default).",
{'p', "delim-pos"});
{'p', "delim-pos"});
args::ValueFlag<std::string> non_reference_nodes(path_investigation_opts, "FILE", "Print to stdout IDs of nodes that are not in the paths listed (by line) in *FILE*.", {"non-reference-nodes"});
args::ValueFlag<std::string> non_reference_ranges(path_investigation_opts, "FILE", "Print to stdout (in BED format) path ranges that are not in the paths listed (by line) in *FILE*.", {"non-reference-ranges"});
args::ValueFlag<uint64_t> min_size(path_investigation_opts, "N", "Minimum size (in bps) of nodes (with --non-reference-nodes) or ranges (with --non-reference-ranges).", {"min-size"});
args::Flag show_step_ranges(path_investigation_opts, "N", "Show steps (that is, node IDs and strands) of --non-reference-ranges.", {"show-step-ranges"});
args::Group path_modification_opts(parser, "[ Path Modification Options ]");
args::ValueFlag<std::string> keep_paths_file(path_modification_opts, "FILE", "Keep paths listed (by line) in *FILE*.", {'K', "keep-paths"});
args::ValueFlag<std::string> drop_paths_file(path_modification_opts, "FILE", "Drop paths listed (by line) in *FILE*.", {'X', "drop-paths"});
Expand Down Expand Up @@ -92,6 +96,11 @@ int main_paths(int argc, char** argv) {
return 1;
}

if (non_reference_nodes && non_reference_ranges) {
std::cerr << "[odgi::paths] error: specify --non-reference-nodes or --non-reference-ranges, not both." << std::endl;
return 1;
}

const uint64_t num_threads = args::get(threads) ? args::get(threads) : 1;
omp_set_num_threads(num_threads);

Expand Down Expand Up @@ -347,6 +356,147 @@ int main_paths(int argc, char** argv) {
}
}

if (
(non_reference_nodes && !args::get(non_reference_nodes).empty()) ||
(non_reference_ranges && !args::get(non_reference_ranges).empty())
) {
const uint64_t min_size_in_bp = min_size ? args::get(min_size) : 0;

// Check if the node IDs are compacted
const uint64_t shift = graph.min_node_id();
if (graph.max_node_id() - shift >= graph.get_node_count()){
std::cerr << "[odgi::paths] error: the node IDs are not compacted. Please run 'odgi sort' using -O, --optimize to optimize the graph." << std::endl;
exit(1);
}

// Read paths to use as reference paths
std::vector<path_handle_t> reference_paths;
std::string line;
auto& x = non_reference_nodes && !args::get(non_reference_nodes).empty() ? args::get(non_reference_nodes) : args::get(non_reference_ranges);
std::ifstream infile(x);
while (std::getline(infile, line)) {
// This file should contain path names, one per line
auto& name = line;
if (graph.has_path(name)) {
reference_paths.push_back(graph.get_path_handle(name));
} else {
std::cerr << "[odgi::paths] error: path'" << name
<< "' does not exist in graph." << std::endl;
return 1;
}
}

if (non_reference_nodes && !args::get(non_reference_nodes).empty()){
// Emit non-reference nodes

// Set non-reference nodes
atomicbitvector::atomic_bv_t non_reference_nodes(graph.get_node_count());
for(uint64_t x = 0; x < non_reference_nodes.size(); ++x) {
if (min_size_in_bp == 0 || graph.get_length(graph.get_handle(x + shift)) >= min_size_in_bp) {
non_reference_nodes.set(x);
}
}
#pragma omp parallel for schedule(dynamic,1)
for (auto &path : reference_paths) {
graph.for_each_step_in_path(path, [&](const step_handle_t& step) {
const handle_t handle = graph.get_handle_of_step(step);
non_reference_nodes.reset(graph.get_id(handle) - shift);
});
}

// Emit non-reference nodes
std::cout << "#node.id\tpaths" << std::endl;
for (auto x : non_reference_nodes) {
const handle_t handle = graph.get_handle(x + shift);

// Check paths that go through this node, if any
std::unordered_set<path_handle_t> unique_path_handles;
graph.for_each_step_on_handle(handle, [&](const step_handle_t& step) {
unique_path_handles.insert(graph.get_path_handle_of_step(step));
});
std::string result;
for (const auto& path : unique_path_handles) {
if (!result.empty()) {
result += ",";
}
result += graph.get_path_name(path);
}

std::cout << graph.get_id(handle) << "\t" << result << std::endl;
}
} else {
// Emit non-reference ranges

const bool _show_step_ranges = args::get(show_step_ranges);

// Set the reference nodes
atomicbitvector::atomic_bv_t reference_nodes(graph.get_node_count());
#pragma omp parallel for schedule(dynamic,1)
for (auto &path : reference_paths) {
graph.for_each_step_in_path(path, [&](const step_handle_t& step) {
const handle_t handle = graph.get_handle_of_step(step);
reference_nodes.set(graph.get_id(handle) - shift);
});
}

// Prepare non reference path handles for parallel processing
std::vector<path_handle_t> non_reference_paths;
graph.for_each_path_handle([&non_reference_paths](const path_handle_t& path) {
non_reference_paths.push_back(path);
});
std::sort(non_reference_paths.begin(), non_reference_paths.end());
std::sort(reference_paths.begin(), reference_paths.end());
non_reference_paths.erase(
std::remove_if(non_reference_paths.begin(), non_reference_paths.end(),
[&reference_paths](const auto &x) {
return std::binary_search(reference_paths.begin(), reference_paths.end(), x);
}), non_reference_paths.end());

// Traverse non reference paths to emit non-reference ranges
if (_show_step_ranges) {
std::cout << "#path.name\tstart\tend\tsteps" << std::endl;
} else {
std::cout << "#path.name\tstart\tend" << std::endl;
}
#pragma omp parallel for schedule(dynamic, 1)
for (auto& path : non_reference_paths) {
uint64_t start = 0, end = 0;
std::vector<step_handle_t> step_range;
graph.for_each_step_in_path(path, [&](const step_handle_t& step) {
const handle_t handle = graph.get_handle_of_step(step);
const uint64_t index = graph.get_id(handle) - shift;
if (reference_nodes.test(index)) {
// Emit the previous non reference range, if any
if (end > start && (end - start) >= min_size_in_bp) {
if (_show_step_ranges) {
std::string step_range_str = "";
for (auto& step : step_range) {
const handle_t handle = graph.get_handle_of_step(step);
step_range_str += std::to_string(graph.get_id(handle)) + (graph.get_is_reverse(handle) ? "-" : "+") + ",";
}
#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << step_range_str.substr(0, step_range_str.size() - 1) << std::endl; // trim the trailing comma from step_range
} else {
#pragma omp critical (cout)
std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << std::endl;
}
}
end += graph.get_length(handle);
start = end;
if (_show_step_ranges) {
step_range.clear();
}
} else {
end += graph.get_length(handle);
}
if (_show_step_ranges) {
step_range.push_back(step);
}
});
}
}
}

return 0;
}

Expand Down

0 comments on commit 3cf6705

Please sign in to comment.