-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstream-tokenizer.cc
72 lines (63 loc) · 1.85 KB
/
stream-tokenizer.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
/**
* Provides the implementation of the StreamTokenizer method set, which
* operates on C++ strings, but is sensitive to the possibility that the
* character arrays inside are UTF-8 encodings of (in some cases, multi-byte)
* characters.
*/
#include <istream>
#include <string>
#include "stream-tokenizer.h"
#include <libxml/xmlstring.h>
using namespace std;
/**
 * Constructor: StreamTokenizer
 * ----------------------------
 * Records the input stream, the delimiter string, and the skip-delimiters
 * flag in the members of the same names.  No input is consumed here; reading
 * only begins once hasMoreTokens() or nextToken() is called.
 *
 * @param is             stream the tokenizer will pull bytes from.
 * @param delimiters     string whose characters are treated as delimiters.
 * @param skipDelimiters when true, hasMoreTokens() discards delimiter
 *                       characters instead of exposing them as tokens.
 */
StreamTokenizer::StreamTokenizer(istream& is,
                                 const string& delimiters,
                                 bool skipDelimiters)
  : is(is),
    delimiters(delimiters),
    skipDelimiters(skipDelimiters) {}
/**
 * Method: hasMoreTokens
 * ---------------------
 * Reports whether a subsequent call to nextToken() has a character to start
 * a token with.  On a true return, that character is parked in savedChar so
 * that the next getNextXMLChar() call hands it back.
 *
 * When skipDelimiters is set, characters that occur in the delimiters string
 * are consumed and thrown away while looking for the start of a token.
 * When it is not set, whatever character comes next (delimiter or not)
 * counts as the start of a token.
 *
 * getNextXMLChar() returns the empty string in two different situations:
 * the stream has failed (typically end of input), or up to four bytes were
 * consumed without forming a valid UTF-8 character.  Only the first of these
 * means "no more tokens", so the two are told apart with is.fail().  This
 * guarantees that a true return always leaves a non-empty savedChar, and that
 * a run of undecodable bytes does not hide valid text that follows it.
 *
 * @return true if a token-starting character is available, false once the
 *         stream can supply no more characters.
 */
bool StreamTokenizer::hasMoreTokens() const {
  while (true) {
    // Yields any previously parked character first, otherwise reads the stream.
    string ch = getNextXMLChar();

    if (ch.empty()) {
      if (is.fail()) return false;  // stream exhausted (or otherwise unreadable)
      continue;                     // undecodable bytes were discarded; keep looking
    }

    // A character is a delimiter when its byte sequence occurs in delimiters.
    if (skipDelimiters &&
        xmlStrstr(BAD_CAST delimiters.c_str(), BAD_CAST ch.c_str()) != NULL) {
      continue;
    }

    savedChar = ch;  // park it so nextToken() starts with this character
    return true;
  }
}
/**
 * Method: nextToken
 * -----------------
 * Produces the next token from the stream, or the empty string when
 * hasMoreTokens() reports that nothing is left.
 *
 * The token always begins with the first character available.  If that
 * character is itself found in the delimiters string, it is returned alone.
 * Otherwise characters are appended until either getNextXMLChar() yields the
 * empty string or a delimiter shows up; a terminating delimiter is not part
 * of the token and is parked in savedChar so the next read sees it again.
 *
 * @return the token text, possibly empty.
 */
string StreamTokenizer::nextToken() {
  if (!hasMoreTokens()) return "";

  // delimiters is not modified below, so this pointer stays valid throughout.
  const xmlChar *delims = BAD_CAST delimiters.c_str();

  string piece = getNextXMLChar();
  string result = piece;
  if (xmlStrstr(delims, BAD_CAST piece.c_str()) != NULL) return result;

  for (piece = getNextXMLChar(); !piece.empty(); piece = getNextXMLChar()) {
    if (xmlStrstr(delims, BAD_CAST piece.c_str()) != NULL) {
      savedChar = piece;  // push the delimiter back for the next call
      return result;
    }
    result += piece;
  }

  // Ran out of readable characters before meeting a delimiter.
  return result;
}
/**
 * Method: getNextXMLChar
 * ----------------------
 * Returns the next character as a string holding its UTF-8 byte sequence.
 *
 * If a character was parked in savedChar, it is handed back (and savedChar
 * is cleared) without touching the stream.  Otherwise bytes are read one at
 * a time, up to four, until xmlCheckUTF8 accepts the accumulated bytes.
 *
 * @return the character's bytes, or the empty string when a read fails or
 *         when four bytes have been consumed without forming valid UTF-8
 *         (in which case those bytes are dropped).
 */
string StreamTokenizer::getNextXMLChar() const {
  if (!savedChar.empty()) {
    string pending;
    pending.swap(savedChar);  // hand back the parked character, leaving savedChar empty
    return pending;
  }

  const size_t kMaxUTF8CharBytes = 4;
  // One spare slot keeps the buffer NUL-terminated for xmlCheckUTF8.
  char bytes[kMaxUTF8CharBytes + 1] = {0};

  for (size_t count = 0; count < kMaxUTF8CharBytes; ) {
    const char next = is.get();
    if (is.fail()) return "";
    bytes[count++] = next;
    if (xmlCheckUTF8(BAD_CAST bytes)) return string(bytes, count);
  }

  // Four bytes read and still not a valid encoding.
  return "";
}