|
15 | 15 | #include <cstdio>
|
16 | 16 | #include <cerrno>
|
17 | 17 | #include <cstdlib>
|
| 18 | +#include <regex> |
18 | 19 | #include <omp.h>
|
19 | 20 | #include <sys/stat.h>
|
20 | 21 | #include <sys/wait.h>
|
@@ -4716,12 +4717,77 @@ bool IndexRegistry::gfa_has_haplotypes(const string& filepath) {
|
4716 | 4717 | cerr << "error:[IndexRegistry] Could not open GFA file " << filepath << endl;
|
4717 | 4718 | exit(1);
|
4718 | 4719 | }
|
| 4720 | + |
| 4721 | + unordered_set<string> ref_samples; |
| 4722 | + |
| 4723 | + // to pull the value out of the ref sense tag |
| 4724 | + regex ref_tag_regex("RS:Z:([a-zA-Z0-9 ._\\-]+)"); |
| 4725 | + // to split the value into samples along whitespace |
| 4726 | + regex sample_regex("([^\\s]+)(\\s+([^\\s]+))*"); |
| 4727 | + |
4719 | 4728 | while (strm.good()) {
|
4720 | 4729 | char line_type = strm.get();
|
4721 |
| - if (line_type == 'W') { |
4722 |
| - return true; |
| 4730 | + if (line_type == 'H') { |
| 4731 | + // look for reference sense path names |
| 4732 | + string line; |
| 4733 | + getline(strm, line); |
| 4734 | + smatch tag_sub; |
| 4735 | + bool found_match = regex_search(line, tag_sub, ref_tag_regex); |
| 4736 | + if (!found_match) { |
| 4737 | + // no ref sense tag |
| 4738 | + continue; |
| 4739 | + } |
| 4740 | + string tag_value = tag_sub[1]; |
| 4741 | + smatch val_sub; |
| 4742 | + found_match = regex_search(tag_value, val_sub, sample_regex); |
| 4743 | + if (!found_match) { |
| 4744 | + // ref sense tag is malformed |
| 4745 | + cerr << tag_sub[0] << endl; |
| 4746 | + exit(1); |
| 4747 | + } |
| 4748 | + |
| 4749 | + // record the ref samples |
| 4750 | + for (size_t i = 1; i < val_sub.size(); ++i) { |
| 4751 | + string submatch = val_sub[i]; |
| 4752 | + if (isspace(submatch[0])) { |
| 4753 | + // TODO: ugly |
| 4754 | + // this is one of the splits that includes the spacer between sample names |
| 4755 | + continue; |
| 4756 | + } |
| 4757 | + ref_samples.insert(submatch); |
| 4758 | + } |
| 4759 | + } |
| 4760 | + else { |
| 4761 | + if (line_type == 'P') { |
| 4762 | + if (strm.get() != '\t') { |
| 4763 | + cerr << "error: P-line does not have tab following line type\n"; |
| 4764 | + exit(1); |
| 4765 | + } |
| 4766 | + |
| 4767 | + string path_name; |
| 4768 | + getline(strm, path_name, '\t'); |
| 4769 | + |
| 4770 | + if (PathMetadata::parse_sense(path_name) == PathSense::HAPLOTYPE) { |
| 4771 | + string sample = PathMetadata::parse_sample_name(path_name); |
| 4772 | + if (sample != PathMetadata::NO_SAMPLE_NAME || !ref_samples.count(sample)) { |
| 4773 | + return true; |
| 4774 | + } |
| 4775 | + } |
| 4776 | + } |
| 4777 | + else if (line_type == 'W') { |
| 4778 | + if (strm.get() != '\t') { |
| 4779 | + cerr << "error: W-line does not have tab following line type\n"; |
| 4780 | + exit(1); |
| 4781 | + } |
| 4782 | + |
| 4783 | + string sample; |
| 4784 | + getline(strm, sample, '\t'); |
| 4785 | + if (!ref_samples.count(sample)) { |
| 4786 | + return true; |
| 4787 | + } |
| 4788 | + } |
| 4789 | + strm.ignore(numeric_limits<streamsize>::max(), '\n'); |
4723 | 4790 | }
|
4724 |
| - strm.ignore(numeric_limits<streamsize>::max(), '\n'); |
4725 | 4791 | }
|
4726 | 4792 | return false;
|
4727 | 4793 | }
|
|
0 commit comments