-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathhtml-document.cc
57 lines (49 loc) · 1.92 KB
/
html-document.cc
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
/**
* Presents the implementation for the HTMLDocument class, which relies
* on libxml2's ability to parse a single document and extract the textual content
* from the body tag.
*/
#include <iostream>
#include <vector>
#include <cassert>
#include <sstream>
#include <libxml/tree.h>
#include <libxml/HTMLparser.h>
#include <libxml/xpath.h>
#include <libxml/xpathInternals.h>
#include "html-document.h"
#include "html-document-exception.h"
#include "stream-tokenizer.h"
using namespace std;
// Option bits handed to libxml2's HTML reader. Per libxml2's htmlParserOption
// documentation: forbid network access, suppress warning and error reports,
// and drop blank nodes from the resulting tree.
static const int kHTMLParseFlags =
    HTML_PARSE_NONET | HTML_PARSE_NOWARNING | HTML_PARSE_NOERROR | HTML_PARSE_NOBLANKS;

// Every character that separates one token from the next when the body text
// is split up: whitespace, backspace, and ASCII punctuation.
static const string kDelimiters(" \t\n\r\b!@#$%^&*()_-+=~`{[}]|\\\"':;<,>.?/");
void HTMLDocument::parse() {
htmlDocPtr doc = htmlReadFile(url.c_str(), /* encoding = */ NULL, kHTMLParseFlags);
if (doc == NULL) {
// This is the only real user error we handle with any frequency, as it's
// completely reasonable that the client more than occasionally specify a bogus URL.
ostringstream oss;
oss << "Error: unable to parse the document at \"" << url << "\".";
throw HTMLDocumentException(oss.str());
}
xmlXPathContextPtr context = xmlXPathNewContext(doc);
const xmlChar *expr = BAD_CAST "//body";
xmlXPathObjectPtr bodies = xmlXPathEvalExpression(expr, context);
xmlNodeSetPtr bodyNodes = bodies->nodesetval;
int numBodyTags = bodyNodes != NULL ? bodyNodes->nodeNr : 0;
for (int i = 0; i < numBodyTags; i++) { // should only be one body tag, but whatever
xmlChar *rawContent = xmlNodeGetContent(bodyNodes->nodeTab[i]);
string bodyContent = (const char *) rawContent;
xmlFree(rawContent);
istringstream iss(bodyContent);
StreamTokenizer st(iss, kDelimiters, /* skipDelimiters = */ true);
while (st.hasMoreTokens()) {
string token = st.nextToken();
tokens.push_back(token);
}
}
xmlXPathFreeObject(bodies);
xmlXPathFreeContext(context);
xmlFreeDoc(doc);
}