Skip to content

Commit edd5441

Browse files
authored
Merge pull request #21 from iqbal-lab-org/fix/20
Now enabling querying reads from fastq.gz files
2 parents e7eb5c0 + 8a08867 commit edd5441

File tree

6 files changed

+324
-39
lines changed

6 files changed

+324
-39
lines changed

CMakeLists.txt

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -189,6 +189,9 @@ endif()
189189
add_subdirectory(extlib/tlx)
190190
set(COBS_LINK_LIBRARIES tlx ${COBS_LINK_LIBRARIES})
191191

192+
### use kseq ###
193+
include_directories(extlib/kseq)
194+
192195
################################################################################
193196
### Descend into Subdirectories
194197

cobs/query/search.hpp

Lines changed: 14 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,9 @@
1919
#include <cobs/file/classic_index_header.hpp>
2020
#include <cobs/query/compact_index/mmap_search_file.hpp>
2121
#include <cobs/query/classic_index/mmap_search_file.hpp>
22-
22+
#include <zlib.h>
23+
#include <kseq.h>
24+
KSEQ_INIT(gzFile, gzread)
2325

2426

2527
namespace cobs {
@@ -106,43 +108,25 @@ static inline void process_query(
106108
output_stream << res.doc_name << '\t' << res.score << '\n';
107109
}
108110
} else if (!query_file.empty()) {
109-
std::ifstream qf(query_file);
110-
std::string line, query, comment;
111-
112-
while (std::getline(qf, line)) {
113-
if (line.empty())
114-
continue;
115-
116-
if (line[0] == '>' || line[0] == ';') {
117-
if (!query.empty()) {
118-
// perform query
119-
s.search(query, result, threshold, num_results);
120-
output_stream << comment << '\t' << result.size() << '\n';
121-
122-
for (const auto &res: result) {
123-
output_stream << res.doc_name << '\t' << res.score << '\n';
124-
}
125-
}
126-
127-
// clear and copy query comment
128-
line[0] = '*';
129-
query.clear();
130-
comment = line;
131-
continue;
132-
} else {
133-
query += line;
134-
}
135-
}
111+
gzFile fp = gzopen(query_file.c_str(), "r");
112+
if (!fp)
113+
die("Could not open query file: " + query_file);
114+
115+
kseq_t *seq = kseq_init(fp);
116+
while (kseq_read(seq) >= 0) {
117+
std::string comment(seq->name.s ? seq->name.s : "");
118+
std::string query(seq->seq.s ? seq->seq.s : "");
136119

137-
if (!query.empty()) {
138120
// perform query
139121
s.search(query, result, threshold, num_results);
140-
output_stream << comment << '\t' << result.size() << '\n';
122+
output_stream << "*" << comment << '\t' << result.size() << '\n';
141123

142124
for (const auto &res: result) {
143125
output_stream << res.doc_name << '\t' << res.score << '\n';
144126
}
145127
}
128+
kseq_destroy(seq);
129+
gzclose(fp);
146130
} else {
147131
die("Pass a verbatim query or a query file.");
148132
}

extlib/kseq/kseq.h

Lines changed: 248 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,248 @@
1+
/* The MIT License
2+
3+
Copyright (c) 2008, 2009, 2011 Attractive Chaos <attractor@live.co.uk>
4+
5+
Permission is hereby granted, free of charge, to any person obtaining
6+
a copy of this software and associated documentation files (the
7+
"Software"), to deal in the Software without restriction, including
8+
without limitation the rights to use, copy, modify, merge, publish,
9+
distribute, sublicense, and/or sell copies of the Software, and to
10+
permit persons to whom the Software is furnished to do so, subject to
11+
the following conditions:
12+
13+
The above copyright notice and this permission notice shall be
14+
included in all copies or substantial portions of the Software.
15+
16+
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17+
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18+
MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
19+
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
20+
BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
21+
ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
22+
CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23+
SOFTWARE.
24+
*/
25+
26+
/* Last Modified: 2017-02-11 */
27+
28+
#ifndef AC_KSEQ_H
29+
#define AC_KSEQ_H
30+
31+
#include <ctype.h>
32+
#include <stdint.h>
33+
#include <string.h>
34+
#include <stdlib.h>
35+
36+
#define KS_SEP_SPACE 0 // isspace(): \t, \n, \v, \f, \r
37+
#define KS_SEP_TAB 1 // isspace() && !' '
38+
#define KS_SEP_LINE 2 // line separator: "\n" (Unix) or "\r\n" (Windows)
39+
#define KS_SEP_MAX 2
40+
41+
#define __KS_TYPE(type_t) \
42+
typedef struct __kstream_t { \
43+
unsigned char *buf; \
44+
int begin, end, is_eof; \
45+
type_t f; \
46+
} kstream_t;
47+
48+
#define ks_err(ks) ((ks)->end < 0)
49+
#define ks_eof(ks) ((ks)->is_eof && (ks)->begin >= (ks)->end)
50+
#define ks_rewind(ks) ((ks)->is_eof = (ks)->begin = (ks)->end = 0)
51+
52+
#define __KS_BASIC(type_t, __bufsize) \
53+
static inline kstream_t *ks_init(type_t f) \
54+
{ \
55+
kstream_t *ks = (kstream_t*)calloc(1, sizeof(kstream_t)); \
56+
ks->f = f; \
57+
ks->buf = (unsigned char*)malloc(__bufsize); \
58+
return ks; \
59+
} \
60+
static inline void ks_destroy(kstream_t *ks) \
61+
{ \
62+
if (ks) { \
63+
free(ks->buf); \
64+
free(ks); \
65+
} \
66+
}
67+
68+
#define __KS_GETC(__read, __bufsize) \
69+
static inline int ks_getc(kstream_t *ks) \
70+
{ \
71+
if (ks_err(ks)) return -3; \
72+
if (ks_eof(ks)) return -1; \
73+
if (ks->begin >= ks->end) { \
74+
ks->begin = 0; \
75+
ks->end = __read(ks->f, ks->buf, __bufsize); \
76+
if (ks->end == 0) { ks->is_eof = 1; return -1; } \
77+
else if (ks->end < 0) { ks->is_eof = 1; return -3; } \
78+
} \
79+
return (int)ks->buf[ks->begin++]; \
80+
}
81+
82+
#ifndef KSTRING_T
83+
#define KSTRING_T kstring_t
84+
typedef struct __kstring_t {
85+
size_t l, m;
86+
char *s;
87+
} kstring_t;
88+
#endif
89+
90+
#ifndef kroundup32
91+
#define kroundup32(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, ++(x))
92+
#endif
93+
94+
#ifndef kroundup64
95+
#define kroundup64(x) (--(x), (x)|=(x)>>1, (x)|=(x)>>2, (x)|=(x)>>4, (x)|=(x)>>8, (x)|=(x)>>16, (x)|=(x)>>32, ++(x))
96+
#endif
97+
98+
#define __KS_GETUNTIL(__read, __bufsize) \
99+
static int64_t ks_getuntil2(kstream_t *ks, int delimiter, kstring_t *str, int *dret, int append) \
100+
{ \
101+
int gotany = 0; \
102+
if (dret) *dret = 0; \
103+
str->l = append? str->l : 0; \
104+
for (;;) { \
105+
int i; \
106+
if (ks_err(ks)) return -3; \
107+
if (ks->begin >= ks->end) { \
108+
if (!ks->is_eof) { \
109+
ks->begin = 0; \
110+
ks->end = __read(ks->f, ks->buf, __bufsize); \
111+
if (ks->end == 0) { ks->is_eof = 1; break; } \
112+
if (ks->end == -1) { ks->is_eof = 1; return -3; } \
113+
} else break; \
114+
} \
115+
if (delimiter == KS_SEP_LINE) { \
116+
unsigned char *sep = (unsigned char*)memchr(ks->buf + ks->begin, '\n', ks->end - ks->begin); \
117+
i = sep != NULL ? sep - ks->buf : ks->end; \
118+
} else if (delimiter > KS_SEP_MAX) { \
119+
for (i = ks->begin; i < ks->end; ++i) \
120+
if (ks->buf[i] == delimiter) break; \
121+
} else if (delimiter == KS_SEP_SPACE) { \
122+
for (i = ks->begin; i < ks->end; ++i) \
123+
if (isspace(ks->buf[i])) break; \
124+
} else if (delimiter == KS_SEP_TAB) { \
125+
for (i = ks->begin; i < ks->end; ++i) \
126+
if (isspace(ks->buf[i]) && ks->buf[i] != ' ') break; \
127+
} else i = 0; /* never come to here! */ \
128+
if (str->m - str->l < (size_t)(i - ks->begin + 1)) { \
129+
str->m = str->l + (i - ks->begin) + 1; \
130+
kroundup64(str->m); \
131+
str->s = (char*)realloc(str->s, str->m); \
132+
} \
133+
gotany = 1; \
134+
memcpy(str->s + str->l, ks->buf + ks->begin, i - ks->begin); \
135+
str->l = str->l + (i - ks->begin); \
136+
ks->begin = i + 1; \
137+
if (i < ks->end) { \
138+
if (dret) *dret = ks->buf[i]; \
139+
break; \
140+
} \
141+
} \
142+
if (!gotany && ks_eof(ks)) return -1; \
143+
if (str->s == 0) { \
144+
str->m = 1; \
145+
str->s = (char*)calloc(1, 1); \
146+
} else if (delimiter == KS_SEP_LINE && str->l > 1 && str->s[str->l-1] == '\r') --str->l; \
147+
str->s[str->l] = '\0'; \
148+
return str->l; \
149+
} \
150+
static inline int ks_getuntil(kstream_t *ks, int delimiter, kstring_t *str, int *dret) \
151+
{ return ks_getuntil2(ks, delimiter, str, dret, 0); }
152+
153+
#define KSTREAM_INIT(type_t, __read, __bufsize) \
154+
__KS_TYPE(type_t) \
155+
__KS_BASIC(type_t, __bufsize) \
156+
__KS_GETC(__read, __bufsize) \
157+
__KS_GETUNTIL(__read, __bufsize)
158+
159+
#define kseq_rewind(ks) ((ks)->last_char = (ks)->f->is_eof = (ks)->f->begin = (ks)->f->end = 0)
160+
161+
#define __KSEQ_BASIC(SCOPE, type_t) \
162+
SCOPE kseq_t *kseq_init(type_t fd) \
163+
{ \
164+
kseq_t *s = (kseq_t*)calloc(1, sizeof(kseq_t)); \
165+
s->f = ks_init(fd); \
166+
return s; \
167+
} \
168+
SCOPE void kseq_destroy(kseq_t *ks) \
169+
{ \
170+
if (!ks) return; \
171+
free(ks->name.s); free(ks->comment.s); free(ks->seq.s); free(ks->qual.s); \
172+
ks_destroy(ks->f); \
173+
free(ks); \
174+
}
175+
176+
/* Return value:
177+
>=0 length of the sequence (normal)
178+
-1 end-of-file
179+
-2 truncated quality string
180+
-3 error reading stream
181+
*/
182+
#define __KSEQ_READ(SCOPE) \
183+
SCOPE int64_t kseq_read(kseq_t *seq) \
184+
{ \
185+
int c,r; \
186+
kstream_t *ks = seq->f; \
187+
if (seq->last_char == 0) { /* then jump to the next header line */ \
188+
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '@'); \
189+
if (c < 0) return c; /* end of file or error*/ \
190+
seq->last_char = c; \
191+
} /* else: the first header char has been read in the previous call */ \
192+
seq->comment.l = seq->seq.l = seq->qual.l = 0; /* reset all members */ \
193+
if ((r=ks_getuntil(ks, 0, &seq->name, &c)) < 0) return r; /* normal exit: EOF or error */ \
194+
if (c != '\n') ks_getuntil(ks, KS_SEP_LINE, &seq->comment, 0); /* read FASTA/Q comment */ \
195+
if (seq->seq.s == 0) { /* we can do this in the loop below, but that is slower */ \
196+
seq->seq.m = 256; \
197+
seq->seq.s = (char*)malloc(seq->seq.m); \
198+
} \
199+
while ((c = ks_getc(ks)) >= 0 && c != '>' && c != '+' && c != '@') { \
200+
if (c == '\n') continue; /* skip empty lines */ \
201+
seq->seq.s[seq->seq.l++] = c; /* this is safe: we always have enough space for 1 char */ \
202+
ks_getuntil2(ks, KS_SEP_LINE, &seq->seq, 0, 1); /* read the rest of the line */ \
203+
} \
204+
if (c == '>' || c == '@') seq->last_char = c; /* the first header char has been read */ \
205+
if (seq->seq.l + 1 >= seq->seq.m) { /* seq->seq.s[seq->seq.l] below may be out of boundary */ \
206+
seq->seq.m = seq->seq.l + 2; \
207+
kroundup64(seq->seq.m); /* rounded to the next closest 2^k */ \
208+
seq->seq.s = (char*)realloc(seq->seq.s, seq->seq.m); \
209+
} \
210+
seq->seq.s[seq->seq.l] = 0; /* null terminated string */ \
211+
seq->is_fastq = (c == '+'); \
212+
if (!seq->is_fastq) return seq->seq.l; /* FASTA */ \
213+
if (seq->qual.m < seq->seq.m) { /* allocate memory for qual in case insufficient */ \
214+
seq->qual.m = seq->seq.m; \
215+
seq->qual.s = (char*)realloc(seq->qual.s, seq->qual.m); \
216+
} \
217+
while ((c = ks_getc(ks)) >= 0 && c != '\n'); /* skip the rest of '+' line */ \
218+
if (c == -1) return -2; /* error: no quality string */ \
219+
while ((c = ks_getuntil2(ks, KS_SEP_LINE, &seq->qual, 0, 1) >= 0 && seq->qual.l < seq->seq.l)); \
220+
if (c == -3) return -3; /* stream error */ \
221+
seq->last_char = 0; /* we have not come to the next header line */ \
222+
if (seq->seq.l != seq->qual.l) return -2; /* error: qual string is of a different length */ \
223+
return seq->seq.l; \
224+
}
225+
226+
#define __KSEQ_TYPE(type_t) \
227+
typedef struct { \
228+
kstring_t name, comment, seq, qual; \
229+
int last_char, is_fastq; \
230+
kstream_t *f; \
231+
} kseq_t;
232+
233+
#define KSEQ_INIT2(SCOPE, type_t, __read) \
234+
KSTREAM_INIT(type_t, __read, 16384) \
235+
__KSEQ_TYPE(type_t) \
236+
__KSEQ_BASIC(SCOPE, type_t) \
237+
__KSEQ_READ(SCOPE)
238+
239+
#define KSEQ_INIT(type_t, __read) KSEQ_INIT2(static, type_t, __read)
240+
241+
#define KSEQ_DECLARE(type_t) \
242+
__KS_TYPE(type_t) \
243+
__KSEQ_TYPE(type_t) \
244+
extern kseq_t *kseq_init(type_t fd); \
245+
void kseq_destroy(kseq_t *ks); \
246+
int64_t kseq_read(kseq_t *seq);
247+
248+
#endif
Binary file not shown.

tests/data/utils/fasta_to_fastq.py

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
import argparse
2+
import random
3+
4+
def read_fasta(filename):
5+
with open(filename, 'r') as file:
6+
sequences = []
7+
sequence = ''
8+
header = ''
9+
for line in file:
10+
line = line.strip()
11+
if line.startswith('>'):
12+
if sequence:
13+
sequences.append((header, sequence))
14+
header = line[1:]
15+
sequence = ''
16+
else:
17+
sequence += line
18+
if sequence:
19+
sequences.append((header, sequence))
20+
return sequences
21+
22+
def generate_random_quality(length):
23+
return ''.join(chr(random.randint(33, 73)) for _ in range(length))
24+
25+
def wrap_text(text, width=80):
26+
return '\n'.join([text[i:i+width] for i in range(0, len(text), width)])
27+
28+
def write_fastq(sequences, filename, width=80):
29+
with open(filename, 'w') as file:
30+
for header, sequence in sequences:
31+
quality = generate_random_quality(len(sequence))
32+
sequence_wrapped = wrap_text(sequence, width)
33+
quality_wrapped = wrap_text(quality, width)
34+
file.write(f'@{header}\n{sequence_wrapped}\n+\n{quality_wrapped}\n')
35+
36+
def main():
37+
parser = argparse.ArgumentParser(description='Convert FASTA to FASTQ format with random quality scores.')
38+
parser.add_argument('input_fasta', help='Path to the input FASTA file')
39+
parser.add_argument('output_fastq', help='Path to the output FASTQ file')
40+
args = parser.parse_args()
41+
42+
sequences = read_fasta(args.input_fasta)
43+
write_fastq(sequences, args.output_fastq, width=80)
44+
45+
print(f'{len(sequences)} sequences have been converted from FASTA to FASTQ format.')
46+
47+
if __name__ == '__main__':
48+
main()

0 commit comments

Comments
 (0)