-
Notifications
You must be signed in to change notification settings - Fork 443
/
header.h
319 lines (270 loc) · 10.2 KB
/
header.h
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
/*
Copyright (c) 2013-2019 Genome Research Ltd.
Authors: James Bonfield <jkb@sanger.ac.uk>, Valeriu Ohan <vo2@sanger.ac.uk>
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice,
this list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
Institute nor the names of its contributors may be used to endorse or promote
products derived from this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH LTD OR CONTRIBUTORS BE LIABLE
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
*/
/*! \file
* SAM header parsing.
*
* These functions can be shared between SAM, BAM and CRAM file
* formats as all three internally use the same string encoding for
* header fields.
*/
#ifndef HEADER_H_
#define HEADER_H_
#include <stdarg.h>
#include "cram/string_alloc.h"
#include "cram/pooled_alloc.h"
#include "htslib/khash.h"
#include "htslib/kstring.h"
#include "htslib/sam.h"
#ifdef __cplusplus
extern "C" {
#endif
/*! Pack a two-letter header type code into one integer.
 *
 * The first character lands in bits 8-15 and the second in bits 0-7.
 * Both characters are converted to unsigned char first, so the result
 * does not depend on whether plain char is signed.
 *
 * @param type  Points to at least two readable characters; only type[0]
 *              and type[1] are examined.
 * @return The combined key.
 */
static inline khint32_t TYPEKEY(const char *type) {
    const unsigned int hi = (unsigned char) type[0];
    const unsigned int lo = (unsigned char) type[1];
    /* lo never exceeds 255, so adding it is the same as OR-ing it in. */
    return hi * 256u + lo;
}
/*
* Proposed new SAM header parsing
1 @SQ ID:foo LN:100
2 @SQ ID:bar LN:200
3 @SQ ID:ram LN:300 UR:xyz
4 @RG ID:r ...
5 @RG ID:s ...
Hash table for 2-char @keys without dup entries.
If dup lines, we form a circular linked list. Ie hash keys = {RG, SQ}.
HASH("SQ")--\
            |
    (3) <-> 1 <-> 2 <-> 3 <-> (1)
HASH("RG")--\
            |
    (5) <-> 4 <-> 5 <-> (4)
Items stored in the hash values also form their own linked lists:
Ie SQ->ID(foo)->LN(100)
SQ->ID(bar)->LN(200)
SQ->ID(ram)->LN(300)->UR(xyz)
RG->ID(r)
*/
/*! A single key:value pair on a header line
*
* These form a linked list and hold strings. The strings are
* allocated from a string_alloc_t pool referenced in the master
* sam_hrecs_t structure. Do not attempt to free, malloc or manipulate
* these strings directly.
*/
typedef struct sam_hrec_tag_s {
struct sam_hrec_tag_s *next; // next tag in this linked list
const char *str; // tag text; pool-allocated, so never free or modify it directly
int len; // NOTE(review): presumably the length of str in bytes - confirm
} sam_hrec_tag_t;
/*! The parsed version of the SAM header string.
*
* Each header type (SQ, RG, HD, etc) points to its own sam_hrec_type_t
* struct via the main hash table h in the sam_hrecs_t struct.
*
* These in turn consist of circular bi-directional linked lists (ie
* rings) to hold the multiple instances of the same header type
* code. For example if we have 5 \@SQ lines the primary hash table
* will key on \@SQ pointing to the first sam_hrec_type_t and that in
* turn will be part of a ring of 5 elements.
*
* Each sam_hrec_type_t structure also points to a sam_hrec_tag_t
* structure which holds the tokenised attributes; the tab separated
* key:value pairs per line.
*/
typedef struct sam_hrec_type_s {
struct sam_hrec_type_s *next; // next line of this type (circular list)
struct sam_hrec_type_s *prev; // previous line of this type (circular list)
struct sam_hrec_type_s *global_next; // next line of any type (circular list of all lines)
struct sam_hrec_type_s *global_prev; // previous line of any type (circular list of all lines)
sam_hrec_tag_t *tag; // first tag on this line; the rest follow via tag->next
khint32_t type; // Two-letter type code as an int (NOTE(review): presumably built by TYPEKEY() - confirm)
} sam_hrec_type_t;
/*! Parsed \@SQ lines */
typedef struct {
const char *name; // reference name (NOTE(review): presumably the SN: value - confirm)
hts_pos_t len; // reference length (NOTE(review): presumably the LN: value - confirm)
sam_hrec_type_t *ty; // NOTE(review): presumably the parsed @SQ header line for this entry - confirm
} sam_hrec_sq_t;
/*! Parsed \@RG lines */
typedef struct {
const char *name; // read-group name (NOTE(review): presumably the ID: value - confirm)
sam_hrec_type_t *ty; // NOTE(review): presumably the parsed @RG header line for this entry - confirm
int name_len; // NOTE(review): presumably the length of name in bytes - confirm
int id; // numerical ID
} sam_hrec_rg_t;
/*! Parsed \@PG lines */
typedef struct {
const char *name; // program name (NOTE(review): presumably the ID: value - confirm)
sam_hrec_type_t *ty; // NOTE(review): presumably the parsed @PG header line for this entry - confirm
int name_len; // NOTE(review): presumably the length of name in bytes - confirm
int id; // numerical ID
int prev_id; // -1 if none (NOTE(review): presumably the id of the entry named by PP: - confirm)
} sam_hrec_pg_t;
/*! Sort order parsed from the \@HD line */
enum sam_sort_order {
ORDER_UNKNOWN =-1, // NOTE(review): presumably also covers a missing or unrecognised SO: value - confirm
ORDER_UNSORTED = 0,
ORDER_NAME = 1, // NOTE(review): presumably SO:queryname - confirm
ORDER_COORD = 2 // NOTE(review): presumably SO:coordinate - confirm
//ORDER_COLLATE = 3 // maybe one day!
};
/*! Group order parsed from the \@HD line
 *
 * NOTE(review): the names match the SAM GO: values none / query /
 * reference; confirm the mapping against the implementation.
 */
enum sam_group_order {
ORDER_NONE =-1,
ORDER_QUERY = 0,
ORDER_REFERENCE = 1
};
// khash map "sam_hrecs_t": integer key -> sam_hrec_type_t*; the type of sam_hrecs_t::h
KHASH_MAP_INIT_INT(sam_hrecs_t, sam_hrec_type_t*)
// khash map "m_s2i": string key -> int; the type of ref_hash, rg_hash and pg_hash
KHASH_MAP_INIT_STR(m_s2i, int)
/*! Primary structure for header manipulation
*
* The initial header text is held in the text kstring_t, but is also
* parsed out into SQ, RG and PG arrays. These have a hash table
* associated with each to allow lookup by ID or SN fields instead of
* their numeric array indices. Additionally PG has an array to hold
* the linked list start points (the last in a PP chain).
*
* Use the appropriate sam_hdr_* functions to edit the header, and
* call sam_hdr_rebuild() any time the textual form needs to be
* updated again.
*/
struct sam_hrecs_t {
khash_t(sam_hrecs_t) *h; //!< Header type code -> lines of that type (NOTE(review): key presumably from TYPEKEY() - confirm)
sam_hrec_type_t *first_line; //!< First line (usually \@HD)
string_alloc_t *str_pool; //!< Pool of sam_hrec_tag_t str strings
pool_alloc_t *type_pool;//!< Pool of sam_hrec_type_t structs
pool_alloc_t *tag_pool; //!< Pool of sam_hrec_tag_t structs
// @SQ lines / references
int nref; //!< Number of \@SQ lines
int ref_sz; //!< Number of entries available in ref[]
sam_hrec_sq_t *ref; //!< Array of parsed \@SQ lines
khash_t(m_s2i) *ref_hash; //!< Maps SQ SN field to ref[] index
// @RG lines / read-groups
int nrg; //!< Number of \@RG lines
int rg_sz; //!< Number of entries available in rg[]
sam_hrec_rg_t *rg; //!< Array of parsed \@RG lines
khash_t(m_s2i) *rg_hash; //!< Maps RG ID field to rg[] index
// @PG lines / programs
int npg; //!< Number of \@PG lines
int pg_sz; //!< Number of entries available in pg[]
int npg_end; //!< Number of terminating \@PG lines
int npg_end_alloc; //!< Size of pg_end field
sam_hrec_pg_t *pg; //!< Array of parsed \@PG lines
khash_t(m_s2i) *pg_hash; //!< Maps PG ID field to pg[] index
int *pg_end; //!< \@PG chain termination IDs
// @cond internal
char *ID_buf; // temporary buffer for sam_hdr_pg_id
uint32_t ID_buf_sz; // NOTE(review): presumably the allocated size of ID_buf - confirm
int ID_cnt; // NOTE(review): presumably a counter used when generating IDs - confirm
// @endcond
int dirty; // marks the header as modified, so it can be rebuilt
int refs_changed; // Index of first changed ref (-1 if unchanged)
int pgs_changed; // New PG line added
int type_count; // NOTE(review): presumably the number of entries in type_order - confirm
char (*type_order)[3]; // array of 3-char entries (NOTE(review): presumably NUL-terminated two-letter type codes - confirm)
};
/*!
 * Parses the header text and populates the internal hash tables.
 * After this call the parsed representation becomes the single
 * source of truth.
 *
 * @param bh Header structure, previously initialised by a
 *           sam_hdr_init call
 * @return 0 on success, -1 on failure
 */
int sam_hdr_fill_hrecs(sam_hdr_t *bh);
/*!
 * Reconstructs the text representation of the header from
 * the hash table data after a change has been performed on
 * the header.
 *
 * @param bh Header whose text form is to be regenerated
 * @return 0 on success, -1 on failure
 */
int sam_hdr_rebuild(sam_hdr_t *bh);
/*! Creates an empty set of parsed SAM header records, ready to be populated.
 *
 * @return
 * Returns a sam_hrecs_t struct on success (free with sam_hrecs_free())
 *         NULL on failure
 */
sam_hrecs_t *sam_hrecs_new(void);
/*! Produces a duplicate copy of hrecs and returns it.
 *
 * @param hrecs Parsed header records to copy
 * @return
 * Returns the copy on success (NOTE(review): presumably released with
 *         sam_hrecs_free(), like the result of sam_hrecs_new() - confirm)
 *         NULL on failure
 */
sam_hrecs_t *sam_hrecs_dup(sam_hrecs_t *hrecs);
/*! Update sam_hdr_t target_name and target_len arrays
 *
 * sam_hdr_t and sam_hrecs_t are specified separately so that sam_hdr_dup
 * can use it to construct target arrays from the source header.
 *
 * @param bh           Header whose target arrays are updated
 * @param hrecs        Parsed records supplying the reference data
 * @param refs_changed NOTE(review): presumably the index of the first
 *                     changed reference, as in sam_hrecs_t::refs_changed
 *                     - confirm
 * @return 0 on success; -1 on failure
 */
int sam_hdr_update_target_arrays(sam_hdr_t *bh, const sam_hrecs_t *hrecs,
int refs_changed);
/*! Reconstructs a kstring from the header hash table.
 *
 * @param hrecs Parsed header records to serialise
 * @param ks    kstring that receives the header text
 * @return
 * Returns 0 on success
 *        -1 on failure
 */
int sam_hrecs_rebuild_text(const sam_hrecs_t *hrecs, kstring_t *ks);
/*! Deallocates all storage used by a sam_hrecs_t struct.
 *
 * This also decrements the header reference count. If after decrementing
 * it is still non-zero then the header is assumed to be in use by another
 * caller and the free is not done.
 *
 * @param hrecs Records to release
 */
void sam_hrecs_free(sam_hrecs_t *hrecs);
/*!
 * Finds the first header line of a given type, optionally restricted to
 * lines carrying a particular tag value.
 *
 * @param hrecs    Parsed header records to search
 * @param type     Header type code to match (e.g. "SQ")
 * @param ID_key   Tag to check, or NULL to accept the first line of 'type'
 * @param ID_value Value the ID_key tag is compared against
 *                 (NOTE(review): presumably ignored when ID_key is NULL
 *                 - confirm)
 * @return
 * Returns the first header item matching 'type'. If ID_key is non-NULL it
 * checks for that tag and compares its value against ID_value.
 *
 * Returns NULL if no type/ID is found
 */
sam_hrec_type_t *sam_hrecs_find_type_id(sam_hrecs_t *hrecs, const char *type,
const char *ID_key, const char *ID_value);
/*!
 * NOTE(review): this function is undocumented in this header; everything
 * below is inferred from the signature and must be confirmed against the
 * implementation.
 *
 * Presumably searches the tag list of the header line 'type' for the tag
 * named 'key'.
 *
 * @param type Header line whose tags are examined
 * @param key  Tag name to look for
 * @param prev Output pointer; presumably receives the tag preceding the
 *             match so that the caller can unlink it
 * @return The matching tag; presumably NULL when there is none
 */
sam_hrec_tag_t *sam_hrecs_find_key(sam_hrec_type_t *type,
const char *key,
sam_hrec_tag_t **prev);
/*!
 * NOTE(review): this function is undocumented in this header; everything
 * below is inferred from the signature and must be confirmed against the
 * implementation.
 *
 * Presumably removes the tag named 'key' from the header line 'type'.
 *
 * @param hrecs Parsed header records that own the line
 * @param type  Header line to modify
 * @param key   Name of the tag to remove
 * @return Status code; presumably 0 on success and -1 on failure like the
 *         other int-returning functions declared here
 */
int sam_hrecs_remove_key(sam_hrecs_t *hrecs,
sam_hrec_type_t *type,
const char *key);
/*! Looks up a read-group by name and returns a pointer to its parsed
 * sam_hrec_rg_t entry.
 *
 * The associated tag list is reachable through the entry's ty member.
 *
 * @param hrecs Parsed header records to search
 * @param rg    Read-group name to look up
 * @return
 * Returns NULL on failure
 */
sam_hrec_rg_t *sam_hrecs_find_rg(sam_hrecs_t *hrecs, const char *rg);
/*! Returns the sort order from the \@HD SO: field
 *
 * @param hrecs Parsed header records to query
 */
enum sam_sort_order sam_hrecs_sort_order(sam_hrecs_t *hrecs);
/*! Returns the group order from the \@HD GO: field
 *
 * The SAM specification stores grouping in GO:; SO: holds the sort order.
 *
 * @param hrecs Parsed header records to query
 */
enum sam_group_order sam_hrecs_group_order(sam_hrecs_t *hrecs);
#ifdef __cplusplus
}
#endif
#endif /* HEADER_H_ */