Skip to content

Commit 3c2fb51

Browse files
committed
More work on integrating gaf.
1 parent 41c2198 commit 3c2fb51

File tree

5 files changed

+88
-71
lines changed

5 files changed

+88
-71
lines changed

src/alignment_format.cpp

+72-42
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ namespace AlignFormat {
4545
cur = cur->duplicate();
4646
other_segments.push_back(cur);
4747
}
48-
cur->core.flag = (line[i] == '>') ? 0 : 16;
48+
cur->flag = (line[i] == '>') ? 0 : 16;
4949
++match_count;
5050
break;
5151
case 1:
@@ -62,16 +62,16 @@ namespace AlignFormat {
6262
break;
6363
case 2:
6464
j = line.find("-", i);
65-
std::from_chars(line.data() + i, line.data() + j, cur->core.pos);
65+
std::from_chars(line.data() + i, line.data() + j, cur->pos);
6666
++match_count;
6767
len = j - i;
6868
i += len;
6969
break;
7070
default:
7171
j = line.find_first_of("><", i);
72-
std::from_chars(line.data() + i, line.data() + j, cur->core.end);
72+
std::from_chars(line.data() + i, line.data() + j, cur->end);
7373
if (!have_cigar) {
74-
cur->blocks.emplace_back() = {(uint32_t)cur->core.pos, (uint32_t)cur->core.end};
74+
cur->blocks.emplace_back() = {(uint32_t)cur->pos, (uint32_t)cur->end};
7575
}
7676
if (j == std::string::npos) {
7777
i = pe;
@@ -89,15 +89,15 @@ namespace AlignFormat {
8989
cur = g;
9090
size_t next_segment = 0;
9191
uint32_t num = 0;
92-
uint32_t current_pos = cur->core.pos;
92+
uint32_t current_pos = cur->pos;
9393
uint32_t block_start = current_pos;
9494
bool in_match = false;
9595

9696
auto flush_block = [&]() {
9797
if (in_match && cur) {
9898
// Ensure block doesn't extend beyond segment boundary and check for overflow
99-
uint32_t end_pos = (current_pos < (uint32_t)cur->core.end) ?
100-
std::min(current_pos, (uint32_t)cur->core.end) : (uint32_t)cur->core.end;
99+
uint32_t end_pos = (current_pos < (uint32_t)cur->end) ?
100+
std::min(current_pos, (uint32_t)cur->end) : (uint32_t)cur->end;
101101
if (block_start < end_pos) {
102102
cur->blocks.emplace_back() = {block_start, end_pos};
103103
}
@@ -111,7 +111,7 @@ namespace AlignFormat {
111111
if (next) {
112112
cur = next;
113113
next_segment++;
114-
current_pos = cur->core.pos;
114+
current_pos = cur->pos;
115115
block_start = current_pos;
116116
in_match = false;
117117
return true;
@@ -136,8 +136,8 @@ namespace AlignFormat {
136136
uint32_t remaining = num;
137137
while (remaining > 0 && cur != nullptr) {
138138
// Calculate how much of the operation fits in current segment
139-
uint32_t segment_remaining = (current_pos < (uint32_t)cur->core.end) ?
140-
((uint32_t)cur->core.end - current_pos) : 0;
139+
uint32_t segment_remaining = (current_pos < (uint32_t)cur->end) ?
140+
((uint32_t)cur->end - current_pos) : 0;
141141

142142
// Protect against overflow in advance calculation
143143
uint32_t max_advance = UINT32_MAX - current_pos;
@@ -152,7 +152,7 @@ namespace AlignFormat {
152152
remaining -= advance;
153153

154154
// If we've reached the end of the segment
155-
if (current_pos >= (uint32_t)cur->core.end) {
155+
if (current_pos >= (uint32_t)cur->end) {
156156
flush_block();
157157
if (!advance_to_next_segment()) {
158158
break; // No more segments
@@ -167,16 +167,16 @@ namespace AlignFormat {
167167
flush_block();
168168
uint32_t remaining = num;
169169
while (remaining > 0 && cur != nullptr) {
170-
uint32_t segment_remaining = (current_pos < (uint32_t)cur->core.end) ?
171-
((uint32_t)cur->core.end - current_pos) : 0;
170+
uint32_t segment_remaining = (current_pos < (uint32_t)cur->end) ?
171+
((uint32_t)cur->end - current_pos) : 0;
172172

173173
uint32_t max_advance = UINT32_MAX - current_pos;
174174
uint32_t advance = std::min({remaining, segment_remaining, max_advance});
175175

176176
current_pos += advance;
177177
remaining -= advance;
178178

179-
if (current_pos >= (uint32_t)cur->core.end) {
179+
if (current_pos >= (uint32_t)cur->end) {
180180
if (!advance_to_next_segment()) {
181181
break; // No more segments
182182
}
@@ -199,12 +199,12 @@ namespace AlignFormat {
199199
flush_block();
200200
}
201201

202-
// std::cout << "g is " << g->core.pos << " - " << g->core.end << std::endl;
202+
// std::cout << "g is " << g->pos << " - " << g->end << std::endl;
203203
// for (auto item : g->blocks) {
204204
// std::cout << item.start << "-" << item.end << " ";
205205
// } std::cout << std::endl;
206206
// for (auto gg : other_segments) {
207-
// std::cout << "gg is " << gg->core.pos << " - " << gg->core.end << std::endl;
207+
// std::cout << "gg is " << gg->pos << " - " << gg->end << std::endl;
208208
// for (auto item : gg->blocks) {
209209
// std::cout << item.start << "-" << item.end << " ";
210210
// } std::cout << std::endl;
@@ -231,8 +231,7 @@ namespace AlignFormat {
231231
// ds:Z::36*ct:129*tg*tc:2+[ag]gcgcag:1*ag:228+[ag]:19+ca:4*ta:11*ga-[caggcgcagaga]ggcgcgccgcgcc[ggcg]:21*cg:4*tc:7*ta:29*ag:3*ct:2*ct:1*gc:32+[g]:33*ac:14*gt:1*tg:15*ga:28+[g]:11*ag:3*tg+[g]:33*tc:20*ag:19*ac:12*ca:223*ta:17*ag:47*cg:50*ag:33*at:22*gt:206*ga:400-[g]:24*gc:184*ga:11+[g]:388*ga:211-[c]:120*tg:1*ag:183*ct:24*gc:329-ag:237*ca:61-[c]:21*tc:365-[c]:109+[aaga]g:217*gc:25*cg:42+[g]:165*ag:22*ag:187*ag:92*tg:62*at:542*gt:2*gt:85*ac+[g]:15*ag:33*ga:111*tc:34*tg:194*ct:79*tc:155*ct:36*ga:74*ct:160*gc:1*cg:46*ag:1307*tc:684*cg:322*ag:68*ct:534*ag:351*ga:92*gc:5*tc:796*ct:89-[c]:465*ct:577*ag:23*tc:586+[g]:302*tc:83*ga:18*gt:25*ac:102+[g]:56*cg:35*ag:312*at:1*gc:170*ca:93*ga:156*ct:136*ct:453*ag:108*ag:67*cg:11*ga:3*gc:17*gt:4*gc:1*ca:15*cg:347+[g]:8*ag:10*cg*cg:21*ac:93+[a]:292*ag:142*ct:107*tc:281*ct:444*ga:284*ag:57*ga:77*ga:309*ga:166+[t]:267*ta:24
232232

233233
void gafParser(std::string& line,
234-
ankerl::unordered_dense::map< std::string, SuperIntervals<int, GAF_t *>>& cached_alignments,
235-
ankerl::unordered_dense::map< std::string, uint32_t>& rids) {
234+
ankerl::unordered_dense::map< std::string, SuperIntervals<int, GAF_t *>>& cached_alignments) {
236235
GAF_t *g = new GAF_t();
237236

238237
size_t pos = 0;
@@ -247,7 +246,8 @@ namespace AlignFormat {
247246
};
248247

249248
next = line.find('\t', pos);
250-
g->qname = line.substr(0, next);
249+
std::string qname = line.substr(0, next);
250+
251251
pos = next + 1;
252252

253253
parse_int(g->qlen);
@@ -287,7 +287,7 @@ namespace AlignFormat {
287287
pos = next + 1;
288288

289289
//12 int Mapping quality (0-255; 255 for missing)
290-
parse_int(g->core.qual);
290+
parse_int(g->qual);
291291

292292
size_t cigar_start = line.find("cg:Z:");
293293
size_t cigar_end = std::string::npos;
@@ -299,13 +299,13 @@ namespace AlignFormat {
299299
std::vector<GAF_t*> other_segments;
300300
extractAlignmentPath(line, g, ps, pe, other_segments, cigar_start, cigar_end);
301301

302-
cached_alignments[g->chrom].add(g->core.pos, g->core.end, g);
302+
cached_alignments[g->chrom].add(g->pos, g->end, g);
303303

304-
// std::cout << g->core.flag << " " << g->chrom << " " << g->core.pos << " " << g->core.end << std::endl;
305-
for (auto & v : other_segments) {
306-
cached_alignments[v->chrom].add(v->core.pos, v->core.end, g);
307-
// std::cout << v->core.flag << " " << v->chrom << " " << v->core.pos << " " << v->core.end << std::endl;
308-
}
304+
// std::cout << g->flag << " " << g->chrom << " " << g->pos << " " << g->end << std::endl;
305+
// for (auto & v : other_segments) {
306+
// cached_alignments[v->chrom].add(v->pos, v->end, g);
307+
// std::cout << v->flag << " " << v->chrom << " " << v->pos << " " << v->end << std::endl;
308+
// }
309309
// std::exit(0);
310310

311311
}
@@ -314,26 +314,26 @@ namespace AlignFormat {
314314
int start, end;
315315
};
316316

317-
void gafFindY(std::vector<GAF_t *>& gafAlignments) {
317+
void gafFindY(std::vector<AlignFormat::GAF_t *>& gafAlignments) {
318318
std::vector<TrackRange> trackLevels;
319319
for (const auto &b : gafAlignments) {
320320
size_t memLen = trackLevels.size();
321321
size_t i = 0;
322322
for (; i < memLen; ++i) {
323-
if (b->core.pos > trackLevels[i].end) {
324-
trackLevels[i].end = b->core.end + 2;
323+
if (b->pos > trackLevels[i].end) {
324+
trackLevels[i].end = b->end;
325325
b->y = (int)i;
326326
break;
327327
}
328328
}
329329
if (i == memLen) {
330-
trackLevels.emplace_back() = {b->core.pos, b->core.end + 2};
330+
trackLevels.emplace_back() = {b->pos, b->end};
331331
b->y = (int)memLen;
332332
}
333333
}
334334
}
335335

336-
void GwAlignment::open(const std::string& file_path, const std::string& reference, int threads, faidx_t* fai) {
336+
void AlignFormat::GwAlignment::open(const std::string& file_path, const std::string& reference, int threads) {
337337
path = file_path;
338338
if (Utils::endsWith(path, "bam") || Utils::endsWith(path, "cram")) {
339339
type = AlignmentType::HTSLIB_t;
@@ -371,24 +371,16 @@ namespace AlignFormat {
371371
throw std::exception();
372372
}
373373
#endif
374-
// Reference id's
375-
ankerl::unordered_dense::map< std::string, uint32_t> rids;
376-
int num_sequences = faidx_nseq(fai);
377-
for (int i = 0; i < num_sequences; ++i) {
378-
const char* seq_name = faidx_iseq(fai, i);
379-
rids[seq_name] = i;
380-
}
381374

382375
std::string tp;
383376
while (true) {
384-
auto got_line = (bool)getline(*fpu, tp);
385-
if (!got_line) {
377+
if (!(bool)getline(*fpu, tp)) {
386378
break;
387379
}
388380
if (tp[0] == '#') {
389381
continue;
390382
}
391-
gafParser(tp, this->cached_alignments, rids);
383+
gafParser(tp, this->cached_alignments);
392384
}
393385
for (auto& item : cached_alignments) {
394386
item.second.index();
@@ -397,10 +389,48 @@ namespace AlignFormat {
397389
}
398390
}
399391

400-
GwAlignment::~GwAlignment() {
392+
AlignFormat::GwAlignment::~GwAlignment() {
401393
if (index) hts_idx_destroy(index);
402394
if (header) sam_hdr_destroy(header);
403395
if (bam) sam_close(bam);
396+
if (!cached_alignments.empty()) {
397+
for (auto& item : cached_alignments) {
398+
for (auto &g : item.second.data) {
399+
delete g;
400+
}
401+
}
402+
}
403+
}
404+
405+
void gafToAlign(AlignFormat::GAF_t* gaf, AlignFormat::Align* align) {
406+
407+
align->cov_start = gaf->pos;
408+
align->cov_end = gaf->end;
409+
align->orient_pattern = AlignFormat::Pattern::NORMAL;
410+
align->left_soft_clip = 0;
411+
align->right_soft_clip = 0;
412+
align->y = gaf->y;
413+
align->edge_type = 1;
414+
align->pos = gaf->pos;
415+
align->reference_end = gaf->end;
416+
align->has_SA = false;
417+
align->blocks = gaf->blocks;
418+
std::cout << gaf->pos << " " << gaf->end << " " << gaf->y << std::endl;
419+
align->delegate = bam_init1();
420+
const char* qname = gaf->qname.c_str();
421+
size_t l_qname = gaf->qname.size();
422+
// int bam_set1(bam1_t *bam,
423+
// size_t l_qname, const char *qname,
424+
// uint16_t flag, int32_t tid, hts_pos_t pos, uint8_t mapq,
425+
// size_t n_cigar, const uint32_t *cigar,
426+
// int32_t mtid, hts_pos_t mpos, hts_pos_t isize,
427+
// size_t l_seq, const char *seq, const char *qual,
428+
// size_t l_aux);
429+
int res = bam_set1(align->delegate, l_qname, qname, (uint64_t)gaf->flag, 0,
430+
(hts_pos_t)align->pos, (uint8_t)gaf->qual, 0, nullptr, 0, 0, 0, 0, nullptr, nullptr, 0);
431+
if (res < 0) {
432+
return;
433+
}
404434
}
405435

406436
}

src/alignment_format.h

+5-11
Original file line numberDiff line numberDiff line change
@@ -82,6 +82,7 @@ namespace AlignFormat {
8282
std::vector<AlignFormat::ModItem> any_mods;
8383

8484
// Constructor
85+
Align() {}
8586
Align(bam1_t *src) { delegate = src; }
8687

8788
// Destructor
@@ -146,16 +147,11 @@ namespace AlignFormat {
146147
GAF_t
147148
};
148149

149-
class Core {
150-
public:
151-
int tid, pos, end, qual, flag, n_cigar, l_qseq, mtid, mpos;
152-
};
153-
154-
class GAF_t {
150+
EXPORT class GAF_t {
155151
public:
156152
std::string qname, chrom;
157-
Core core;
158153
int qlen, qstart, qend;
154+
int pos{0}, end{0}, qual, flag{0};
159155
char strand; // this is strand of the query, not reference
160156
int y{-1};
161157

@@ -168,9 +164,6 @@ namespace AlignFormat {
168164
}
169165
};
170166

171-
void gafParser(std::string& line,
172-
ankerl::unordered_dense::map< std::string, SuperIntervals<int, GAF_t *>>& cached_alignments);
173-
174167
EXPORT class GwAlignment {
175168
public:
176169
AlignmentType type;
@@ -186,8 +179,9 @@ namespace AlignFormat {
186179
GwAlignment() = default;
187180
~GwAlignment();
188181

189-
void open(const std::string& path, const std::string& reference, int threads, faidx_t* fai);
182+
void open(const std::string& path, const std::string& reference, int threads);
190183
};
191184

185+
void gafToAlign(AlignFormat::GAF_t* gaf, AlignFormat::Align* align);
192186

193187
}

src/hts_funcs.cpp

+6-5
Original file line numberDiff line numberDiff line change
@@ -169,24 +169,25 @@ namespace HGW {
169169
bool coverage, std::vector<Parse::Parser> &filters, BS::thread_pool &pool,
170170
const int parse_mods_threshold) {
171171

172-
std::vector<AlignFormat::GAF_t*>& readQueueGAF = col.readQueueGAF;
172+
std::vector<AlignFormat::GAF_t*> &readQueueGAF = col.readQueueGAF;
173173
if (!readQueueGAF.empty()) {
174174
readQueueGAF.clear();
175175
}
176176
if (!col.alignmentFile->cached_alignments.contains(region->chrom)) {
177177
return;
178178
}
179179
col.alignmentFile->cached_alignments[region->chrom].findOverlaps(region->start, region->end, readQueueGAF);
180+
std::reverse(readQueueGAF.begin(), readQueueGAF.end());
180181
if (coverage) {
181182
std::fill(col.covArr.begin(), col.covArr.end(), 0);
182183
for (const auto &i : readQueueGAF) {
183-
// std::cerr << i->core.pos << "-" << i->core.end << std::endl << " ";
184-
// for (auto &v : i->blocks) {
185-
// std::cerr << v.start << "," << v.end << " ";
186-
// } std::cerr << std::endl << std::endl;
187184
Segs::addToCovArray(col.covArr, i->blocks, region->start, region->end);
188185
}
189186
}
187+
col.readQueue.resize(readQueueGAF.size());
188+
for (size_t i=0; i < readQueueGAF.size(); ++i) {
189+
AlignFormat::gafToAlign(readQueueGAF[i], &col.readQueue[i]);
190+
}
190191
col.collection_processed = false;
191192
}
192193

src/plot_manager.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -96,7 +96,7 @@ namespace Manager {
9696
hts_idx_t *idx = sam_index_load(f, fn.c_str());
9797
indexes.push_back(idx);
9898

99-
inputAlignments[i].open(fn, reference, opt.threads, fai);
99+
inputAlignments[i].open(fn, reference, opt.threads);
100100
++i;
101101
}
102102

src/segments.cpp

+4-12
Original file line numberDiff line numberDiff line change
@@ -734,7 +734,7 @@ namespace Segs {
734734
pos += l;
735735
break;
736736
case BAM_CREF_SKIP:
737-
op = BAM_CDEL;
737+
// op = BAM_CDEL;
738738
pos += l;
739739
break;
740740
case BAM_CSOFT_CLIP:
@@ -763,17 +763,6 @@ namespace Segs {
763763
self->has_SA = false;
764764
}
765765

766-
// if (sort_state > 0) {
767-
// if (sort_state == Utils::SortType::HP) {
768-
// uint8_t *HP_tag = bam_aux_get(self->delegate, "HP");
769-
// self->sort_tag = (HP_tag != nullptr) ? (int) bam_aux2i(HP_tag) : 0;
770-
// } else if (sort_state == Utils::SortType::STRAND) {
771-
// self->sort_tag = (int)(src->core.flag & BAM_FREVERSE) ? 1 : 0;
772-
// } else if (sort_by_pos >= 0 && ref_base != '\0') {
773-
// self->sort_tag = getSortCode(src, sort_by_pos, ref_base);
774-
// }
775-
// }
776-
777766
if (parse_mods_threshold > 0) {
778767
hts_base_mod_state* mod_state = new hts_base_mod_state;
779768
int res = bam_parse_basemod_gw(src, mod_state, 0);
@@ -855,6 +844,9 @@ namespace Segs {
855844
}
856845
}
857846

847+
848+
849+
858850
constexpr int POS_MASK = 0b0000111;
859851

860852
void setAlignSortCode(AlignFormat::Align &a, Utils::SortType sort_state, int target_pos, char ref_base) {

0 commit comments

Comments
 (0)