Skip to content

Commit 02bde6d

Browse files
authored
Merge pull request #4178 from vgteam/auto-p-haps
Handle P-lines that specify haplotypes in vg autoindex
2 parents 1488b76 + cf3aa54 commit 02bde6d

File tree

1 file changed

+69 -3 lines changed

1 file changed

+69 -3 lines changed

src/index_registry.cpp

Lines changed: 69 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -15,6 +15,7 @@
1515
#include <cstdio>
1616
#include <cerrno>
1717
#include <cstdlib>
18+
#include <regex>
1819
#include <omp.h>
1920
#include <sys/stat.h>
2021
#include <sys/wait.h>
@@ -4716,12 +4717,77 @@ bool IndexRegistry::gfa_has_haplotypes(const string& filepath) {
47164717
cerr << "error:[IndexRegistry] Could not open GFA file " << filepath << endl;
47174718
exit(1);
47184719
}
4720+
4721+
unordered_set<string> ref_samples;
4722+
4723+
// to pull the value out of the ref sense tag
4724+
regex ref_tag_regex("RS:Z:([a-zA-Z0-9 ._\\-]+)");
4725+
// to split the value into samples along whitespace
4726+
regex sample_regex("([^\\s]+)(\\s+([^\\s]+))*");
4727+
47194728
while (strm.good()) {
47204729
char line_type = strm.get();
4721-
if (line_type == 'W') {
4722-
return true;
4730+
if (line_type == 'H') {
4731+
// look for reference sense path names
4732+
string line;
4733+
getline(strm, line);
4734+
smatch tag_sub;
4735+
bool found_match = regex_search(line, tag_sub, ref_tag_regex);
4736+
if (!found_match) {
4737+
// no ref sense tag
4738+
continue;
4739+
}
4740+
string tag_value = tag_sub[1];
4741+
smatch val_sub;
4742+
found_match = regex_search(tag_value, val_sub, sample_regex);
4743+
if (!found_match) {
4744+
// ref sense tag is malformed
4745+
cerr << tag_sub[0] << endl;
4746+
exit(1);
4747+
}
4748+
4749+
// record the ref samples
4750+
for (size_t i = 1; i < val_sub.size(); ++i) {
4751+
string submatch = val_sub[i];
4752+
if (isspace(submatch[0])) {
4753+
// TODO: ugly
4754+
// this is one of the splits that includes the spacer between sample names
4755+
continue;
4756+
}
4757+
ref_samples.insert(submatch);
4758+
}
4759+
}
4760+
else {
4761+
if (line_type == 'P') {
4762+
if (strm.get() != '\t') {
4763+
cerr << "error: P-line does not have tab following line type\n";
4764+
exit(1);
4765+
}
4766+
4767+
string path_name;
4768+
getline(strm, path_name, '\t');
4769+
4770+
if (PathMetadata::parse_sense(path_name) == PathSense::HAPLOTYPE) {
4771+
string sample = PathMetadata::parse_sample_name(path_name);
4772+
if (sample != PathMetadata::NO_SAMPLE_NAME || !ref_samples.count(sample)) {
4773+
return true;
4774+
}
4775+
}
4776+
}
4777+
else if (line_type == 'W') {
4778+
if (strm.get() != '\t') {
4779+
cerr << "error: W-line does not have tab following line type\n";
4780+
exit(1);
4781+
}
4782+
4783+
string sample;
4784+
getline(strm, sample, '\t');
4785+
if (!ref_samples.count(sample)) {
4786+
return true;
4787+
}
4788+
}
4789+
strm.ignore(numeric_limits<streamsize>::max(), '\n');
47234790
}
4724-
strm.ignore(numeric_limits<streamsize>::max(), '\n');
47254791
}
47264792
return false;
47274793
}

0 commit comments

Comments
 (0)