From 4097440be9d749bf1cbf65322a2a3595c9df48dc Mon Sep 17 00:00:00 2001
From: AndreaGuarracino
Date: Mon, 12 Feb 2024 22:01:55 +0100
Subject: [PATCH] avoid duplicated subpaths

---
 src/position.hpp                | 13 +++++++++++++
 src/subcommand/extract_main.cpp |  8 ++++++++
 2 files changed, 21 insertions(+)

diff --git a/src/position.hpp b/src/position.hpp
index 9aa1ac4db..69078f8cf 100644
--- a/src/position.hpp
+++ b/src/position.hpp
@@ -88,6 +88,19 @@ struct path_range_t {
     std::string data;
 };
 
+/// Orders path ranges lexicographically by begin path, end path,
+/// begin offset and end offset, compared in that order.
+/// No other member of path_range_t (e.g. `data`) takes part in the comparison,
+/// so ranges that differ only in those members are treated as equivalent.
+struct path_range_comparator {
+    bool operator() (const path_range_t& lhs, const path_range_t& rhs) const {
+        if (lhs.begin.path != rhs.begin.path) return lhs.begin.path < rhs.begin.path;
+        if (lhs.end.path != rhs.end.path) return lhs.end.path < rhs.end.path;
+        if (lhs.begin.offset != rhs.begin.offset) return lhs.begin.offset < rhs.begin.offset;
+        return lhs.end.offset < rhs.end.offset;
+    }
+};
+
 inline std::string& get_long_path_name(std::tuple path_long_start_end) {
     return std::get<0>(path_long_start_end);
 }
diff --git a/src/subcommand/extract_main.cpp b/src/subcommand/extract_main.cpp
index ce0ecc022..e900a28d8 100644
--- a/src/subcommand/extract_main.cpp
+++ b/src/subcommand/extract_main.cpp
@@ -556,6 +556,14 @@ namespace odgi {
                         return std::binary_search(source_paths_from_path_ranges.begin(), source_paths_from_path_ranges.end(), x);
                     }), source_paths->end());
 
+                // We don't cut nodes for the extraction, so close path intervals can generate identical subpaths.
+                // To avoid duplicated subpaths in the final subgraph, we remove duplicated path ranges.
+                // Two ranges count as duplicates when path_range_comparator ranks them as equivalent;
+                // as a side effect, path_ranges ends up sorted in that comparator's order.
+                {
+                    const std::set<path_range_t, path_range_comparator> unique_path_ranges(path_ranges.begin(), path_ranges.end());
+                    path_ranges.assign(unique_path_ranges.begin(), unique_path_ranges.end());
+                }
                 if (max_dist_subpaths > 0) {
                     // Iterate multiple times to merge subpaths which became mergeable during the first iteration where new nodes were added