diff --git a/src/algorithms/subgraph/extract.cpp b/src/algorithms/subgraph/extract.cpp index c570068f..2818f6ae 100644 --- a/src/algorithms/subgraph/extract.cpp +++ b/src/algorithms/subgraph/extract.cpp @@ -53,9 +53,10 @@ namespace odgi { } path_handle_t create_subpath(graph_t &subgraph, const string &subpath_name, const bool is_circular) { - if (subgraph.has_path(subpath_name)) { - subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); - } + // The function assumes that every path is new and unique + // if (subgraph.has_path(subpath_name)) { + // subgraph.destroy_path(subgraph.get_path_handle(subpath_name)); + // } return subgraph.create_path_handle(subpath_name, is_circular); }; diff --git a/src/subcommand/extract_main.cpp b/src/subcommand/extract_main.cpp index 9a7da9a5..a81829f0 100644 --- a/src/subcommand/extract_main.cpp +++ b/src/subcommand/extract_main.cpp @@ -267,25 +267,46 @@ namespace odgi { std::vector input_path_ranges; - // handle targets from BED - if (_path_bed_file && !args::get(_path_bed_file).empty()) { - std::ifstream bed_in(args::get(_path_bed_file)); - std::string line; - while (std::getline(bed_in, line)) { - add_bed_range(input_path_ranges, graph, line); + { + // handle targets from BED + if (_path_bed_file && !args::get(_path_bed_file).empty()) { + std::ifstream bed_in(args::get(_path_bed_file)); + std::string line; + while (std::getline(bed_in, line)) { + add_bed_range(input_path_ranges, graph, line); + } } - } - // handle targets from command line - if (_path_range) { - Region region; - parse_region(args::get(_path_range), region); + // handle targets from command line + if (_path_range) { + Region region; + parse_region(args::get(_path_range), region); - // no coordinates given, we do whole thing (0,-1) - if (region.start < 0 || region.end < 0) { - add_bed_range(input_path_ranges, graph, region.seq); - } else { - add_bed_range(input_path_ranges, graph, region.seq + "\t" + std::to_string(region.start) + "\t" + std::to_string(region.end)); + // no coordinates given, we do whole thing (0,-1) + if (region.start < 0 || region.end < 0) { + add_bed_range(input_path_ranges, graph, region.seq); + } else { + add_bed_range(input_path_ranges, graph, region.seq + "\t" + std::to_string(region.start) + "\t" + std::to_string(region.end)); + } + } + + // Check duplicates + std::vector copy_ranges = input_path_ranges; // Create a copy of the vector to avoid sorting the original one + + auto compare_path_range = [](const path_range_t& a, const path_range_t& b) -> bool { + if (a.begin.path != b.begin.path) return a.begin.path < b.begin.path; + if (a.begin.offset != b.begin.offset) return a.begin.offset < b.begin.offset; + if (a.end.path != b.end.path) return a.end.path < b.end.path; + return a.end.offset < b.end.offset; + }; // Lambda function to compare two path_range_t objects + + std::sort(copy_ranges.begin(), copy_ranges.end(), compare_path_range); // Sort the copied vector using the lambda function + + for (size_t i = 1; i < copy_ranges.size(); i++) { + if (!compare_path_range(copy_ranges[i-1], copy_ranges[i])) { + std::cerr << "[odgi::extract] error: " << graph.get_path_name(copy_ranges[i].begin.path) << ":" << copy_ranges[i].begin.offset << "-" << copy_ranges[i].end.offset << " is a duplicated path range" << std::endl; + return 1; + } } } @@ -610,6 +631,7 @@ namespace odgi { const std::string path_name = source.get_path_name(path_range.begin.path); subpaths_from_path_ranges.push_back( + // The function assumes that every path is new and unique odgi::algorithms::create_subpath( subgraph, odgi::algorithms::make_path_name(path_name, path_range.begin.offset, path_range.end.offset), diff --git a/src/subcommand/paths_main.cpp b/src/subcommand/paths_main.cpp index 63223b36..a8e0aa86 100644 --- a/src/subcommand/paths_main.cpp +++ b/src/subcommand/paths_main.cpp @@ -45,7 +45,11 @@ int main_paths(int argc, char** argv) { {'D', "delim"}); args::ValueFlag path_delim_pos(path_investigation_opts, "N", "Consider the N-th occurrence of the delimiter specified with **-D, --delim**" " to obtain the group identifier. Specify 1 for the 1st occurrence (default).", - {'p', "delim-pos"}); + {'p', "delim-pos"}); + args::ValueFlag non_reference_nodes(path_investigation_opts, "FILE", "Print to stdout IDs of nodes that are not in the paths listed (by line) in *FILE*.", {"non-reference-nodes"}); + args::ValueFlag non_reference_ranges(path_investigation_opts, "FILE", "Print to stdout (in BED format) path ranges that are not in the paths listed (by line) in *FILE*.", {"non-reference-ranges"}); + args::ValueFlag min_size(path_investigation_opts, "N", "Minimum size (in bps) of nodes (with --non-reference-nodes) or ranges (with --non-reference-ranges).", {"min-size"}); + args::Flag show_step_ranges(path_investigation_opts, "N", "Show steps (that is, node IDs and strands) of --non-reference-ranges.", {"show-step-ranges"}); args::Group path_modification_opts(parser, "[ Path Modification Options ]"); args::ValueFlag keep_paths_file(path_modification_opts, "FILE", "Keep paths listed (by line) in *FILE*.", {'K', "keep-paths"}); args::ValueFlag drop_paths_file(path_modification_opts, "FILE", "Drop paths listed (by line) in *FILE*.", {'X', "drop-paths"}); @@ -92,6 +96,11 @@ int main_paths(int argc, char** argv) { return 1; } + if (non_reference_nodes && non_reference_ranges) { + std::cerr << "[odgi::paths] error: specify --non-reference-nodes or --non-reference-ranges, not both." << std::endl; + return 1; + } + const uint64_t num_threads = args::get(threads) ? args::get(threads) : 1; omp_set_num_threads(num_threads); @@ -347,6 +356,147 @@ int main_paths(int argc, char** argv) { } } + if ( + (non_reference_nodes && !args::get(non_reference_nodes).empty()) || + (non_reference_ranges && !args::get(non_reference_ranges).empty()) + ) { + const uint64_t min_size_in_bp = min_size ? args::get(min_size) : 0; + + // Check if the node IDs are compacted + const uint64_t shift = graph.min_node_id(); + if (graph.max_node_id() - shift >= graph.get_node_count()){ + std::cerr << "[odgi::paths] error: the node IDs are not compacted. Please run 'odgi sort' using -O, --optimize to optimize the graph." << std::endl; + exit(1); + } + + // Read paths to use as reference paths + std::vector reference_paths; + std::string line; + auto& x = non_reference_nodes && !args::get(non_reference_nodes).empty() ? args::get(non_reference_nodes) : args::get(non_reference_ranges); + std::ifstream infile(x); + while (std::getline(infile, line)) { + // This file should contain path names, one per line + auto& name = line; + if (graph.has_path(name)) { + reference_paths.push_back(graph.get_path_handle(name)); + } else { + std::cerr << "[odgi::paths] error: path'" << name + << "' does not exist in graph." << std::endl; + return 1; + } + } + + if (non_reference_nodes && !args::get(non_reference_nodes).empty()){ + // Emit non-reference nodes + + // Set non-reference nodes + atomicbitvector::atomic_bv_t non_reference_nodes(graph.get_node_count()); + for(uint64_t x = 0; x < non_reference_nodes.size(); ++x) { + if (min_size_in_bp == 0 || graph.get_length(graph.get_handle(x + shift)) >= min_size_in_bp) { + non_reference_nodes.set(x); + } + } +#pragma omp parallel for schedule(dynamic,1) + for (auto &path : reference_paths) { + graph.for_each_step_in_path(path, [&](const step_handle_t& step) { + const handle_t handle = graph.get_handle_of_step(step); + non_reference_nodes.reset(graph.get_id(handle) - shift); + }); + } + + // Emit non-reference nodes + std::cout << "#node.id\tpaths" << std::endl; + for (auto x : non_reference_nodes) { + const handle_t handle = graph.get_handle(x + shift); + + // Check paths that go through this node, if any + std::unordered_set unique_path_handles; + graph.for_each_step_on_handle(handle, [&](const step_handle_t& step) { + unique_path_handles.insert(graph.get_path_handle_of_step(step)); + }); + std::string result; + for (const auto& path : unique_path_handles) { + if (!result.empty()) { + result += ","; + } + result += graph.get_path_name(path); + } + + std::cout << graph.get_id(handle) << "\t" << result << std::endl; + } + } else { + // Emit non-reference ranges + + const bool _show_step_ranges = args::get(show_step_ranges); + + // Set the reference nodes + atomicbitvector::atomic_bv_t reference_nodes(graph.get_node_count()); + #pragma omp parallel for schedule(dynamic,1) + for (auto &path : reference_paths) { + graph.for_each_step_in_path(path, [&](const step_handle_t& step) { + const handle_t handle = graph.get_handle_of_step(step); + reference_nodes.set(graph.get_id(handle) - shift); + }); + } + + // Prepare non reference path handles for parallel processing + std::vector non_reference_paths; + graph.for_each_path_handle([&non_reference_paths](const path_handle_t& path) { + non_reference_paths.push_back(path); + }); + std::sort(non_reference_paths.begin(), non_reference_paths.end()); + std::sort(reference_paths.begin(), reference_paths.end()); + non_reference_paths.erase( + std::remove_if(non_reference_paths.begin(), non_reference_paths.end(), + [&reference_paths](const auto &x) { + return std::binary_search(reference_paths.begin(), reference_paths.end(), x); + }), non_reference_paths.end()); + + // Traverse non reference paths to emit non-reference ranges + if (_show_step_ranges) { + std::cout << "#path.name\tstart\tend\tsteps" << std::endl; + } else { + std::cout << "#path.name\tstart\tend" << std::endl; + } + #pragma omp parallel for schedule(dynamic, 1) + for (auto& path : non_reference_paths) { + uint64_t start = 0, end = 0; + std::vector step_range; + graph.for_each_step_in_path(path, [&](const step_handle_t& step) { + const handle_t handle = graph.get_handle_of_step(step); + const uint64_t index = graph.get_id(handle) - shift; + if (reference_nodes.test(index)) { + // Emit the previous non reference range, if any + if (end > start && (end - start) >= min_size_in_bp) { + if (_show_step_ranges) { + std::string step_range_str = ""; + for (auto& step : step_range) { + const handle_t handle = graph.get_handle_of_step(step); + step_range_str += std::to_string(graph.get_id(handle)) + (graph.get_is_reverse(handle) ? "-" : "+") + ","; + } + #pragma omp critical (cout) + std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << "\t" << step_range_str.substr(0, step_range_str.size() - 1) << std::endl; // trim the trailing comma from step_range + } else { + #pragma omp critical (cout) + std::cout << graph.get_path_name(path) << "\t" << start << "\t" << end << std::endl; + } + } + end += graph.get_length(handle); + start = end; + if (_show_step_ranges) { + step_range.clear(); + } + } else { + end += graph.get_length(handle); + } + if (_show_step_ranges) { + step_range.push_back(step); + } + }); + } + } + } + return 0; }