From 92d4953e0d093a5e59cc6f8922f8463e0ce352f8 Mon Sep 17 00:00:00 2001 From: Adam Novak Date: Fri, 4 Aug 2023 14:09:07 -0400 Subject: [PATCH] Handle two-part contig names again and put them under test --- deps/libhandlegraph | 2 +- src/unittest/handle.cpp | 28 ++++++++++++++++++++++++++ test/graphs/gfa_two_part_reference.gfa | 20 ++++++++++++++++++ test/t/48_vg_convert.t | 9 +++++++-- 4 files changed, 56 insertions(+), 3 deletions(-) create mode 100644 test/graphs/gfa_two_part_reference.gfa diff --git a/deps/libhandlegraph b/deps/libhandlegraph index 9c49c797e39..0b519b72bec 160000 --- a/deps/libhandlegraph +++ b/deps/libhandlegraph @@ -1 +1 @@ -Subproject commit 9c49c797e3949378136539326fa01eb7e473a09d +Subproject commit 0b519b72becbeb8f56f0e3478a1aef54fa241106 diff --git a/src/unittest/handle.cpp b/src/unittest/handle.cpp index 465a5cff8e0..77bfde3045c 100644 --- a/src/unittest/handle.cpp +++ b/src/unittest/handle.cpp @@ -15,6 +15,7 @@ #include "bdsg/hash_graph.hpp" #include +#include #include #include @@ -2542,5 +2543,32 @@ TEST_CASE("handlegraph PathMetadata name format preserves ranges on generic path REQUIRE(subrange.second == PathMetadata::NO_END_POSITION); } +TEST_CASE("handlegraph PathMetadata name format can parse two-part names", "[handle]") { + std::string path_name = "GRCh38#chr1"; + + PathSense sense; + string sample; + string locus; + size_t haplotype; + size_t phase_block; + subrange_t subrange; + PathMetadata::parse_path_name(path_name, + sense, + sample, + locus, + haplotype, + phase_block, + subrange); + + REQUIRE(PathMetadata::parse_sample_name(path_name) == sample); + REQUIRE(PathMetadata::parse_locus_name(path_name) == locus); + + REQUIRE(sense == PathSense::REFERENCE); + REQUIRE(sample == "GRCh38"); + REQUIRE(locus == "chr1"); + REQUIRE(phase_block == PathMetadata::NO_PHASE_BLOCK); + REQUIRE(subrange == PathMetadata::NO_SUBRANGE); +} + } } diff --git a/test/graphs/gfa_two_part_reference.gfa b/test/graphs/gfa_two_part_reference.gfa new file 
mode 100644 index 00000000000..742976e19fd --- /dev/null +++ b/test/graphs/gfa_two_part_reference.gfa @@ -0,0 +1,20 @@ +H VN:Z:1.1 RS:Z:GRCh37 GRCh38 +S 1 G +S 2 A +S 4 GGG +S 5 T +S 6 A +S 7 C +S 8 A +S 9 A +L 1 + 2 + 0M +L 1 + 4 + 0M +L 2 + 4 + 0M +L 4 + 5 + 0M +L 5 + 6 + 0M +L 6 + 7 + 0M +L 6 + 8 + 0M +L 7 + 9 + 0M +L 8 + 9 + 0M +P GRCh38#chr1 1+,4+,5+,6+,7+,9+ *,*,*,*,* +P GRCh37#chr1 1+,2+,4+,5+,6+,8+,9+ *,*,*,*,*,* diff --git a/test/t/48_vg_convert.t b/test/t/48_vg_convert.t index 1131f983499..04fbb81803d 100644 --- a/test/t/48_vg_convert.t +++ b/test/t/48_vg_convert.t @@ -7,7 +7,7 @@ PATH=../bin:$PATH # for vg export LC_ALL="C" # force a consistent sort order -plan tests 100 +plan tests 102 vg construct -r complex/c.fa -v complex/c.vcf.gz > c.vg cat <(vg view c.vg | grep ^S | sort) <(vg view c.vg | grep L | uniq | wc -l) <(vg paths -v c.vg -E) > c.info @@ -411,8 +411,13 @@ vg convert -a graphs/components_paths_rgfa.gfa > components_paths_rgfa.hg is "${?}" "0" "GFA -> HashGraph conversion works with redundant paths" is "$(vg paths --list -x components_paths_rgfa.hg | wc -l)" "1" "GFA -> HashGraph conversion with redundant paths keeps one copy of the redundant path" +# We should be able to handle pseudo-PanSN paths where there is no haplotype +vg convert -a graphs/gfa_two_part_reference.gfa > gfa_two_part_reference.hg +is "${?}" "0" "GFA -> HashGraph conversion works with two-part reference path names" +is "$(vg paths -M -x gfa_two_part_reference.hg | grep REFERENCE | wc -l)" "2" "GFA -> HashGraph conversion with two-part reference path names gets the right paths" + rm -f paths.truth.txt paths.gbz.txt paths.gfa.txt paths.hg.txt -rm -f gfa_with_reference.gbz rgfa_with_reference.gbz gfa_with_reference.hg components_paths_rgfa.hg rgfa_with_reference.hg extracted.gfa +rm -f gfa_with_reference.gbz rgfa_with_reference.gbz gfa_with_reference.hg components_paths_rgfa.hg gfa_two_part_reference.hg rgfa_with_reference.hg extracted.gfa ##### # GFA Streaming