-
Notifications
You must be signed in to change notification settings - Fork 0
/
rdf_parser.cpp
200 lines (161 loc) · 5.95 KB
/
rdf_parser.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
#include "rdf_parser.h"
#include <algorithm>
#include <limits>
#include <sstream>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
// Constructs the parser and allocates the Serd environment that stores the
// prefixes and base URI used while parsing. No initial base URI is supplied.
RDFParser::RDFParser()
    : env(serd_env_new(nullptr)) {
}
// Releases the Serd environment allocated by the constructor.
RDFParser::~RDFParser() {
    serd_env_free(this->env);
}
/**
 * Parses an RML rule given as Turtle text.
 *
 * Runs the Serd parser over the text, then records the highest numeric blank
 * node id seen among the collected triples in blank_node_id.
 *
 * @param rml_rule Turtle document to parse.
 * @return A copy of the collected triples.
 */
std::vector<NTriple> RDFParser::parse(const std::string& rml_rule) {
    handle_rdf_parsing(rml_rule);

    // Remember the largest "b<N>" label that is already in use.
    const std::vector<NTriple>& collected = rml_triples;
    blank_node_id = getHighestBlankNodeID(collected);

    return collected;
}
// C-callback trampoline: recovers the parser instance from the opaque handle
// and forwards the error to the member function handle_error.
SerdStatus RDFParser::static_handle_error(void* handle, const SerdError* error) {
    RDFParser* const self = static_cast<RDFParser*>(handle);
    return self->handle_error(handle, error);
}
// C-callback trampoline: recovers the parser instance from the opaque handle
// and forwards the prefix definition to the member function capture_prefix.
SerdStatus RDFParser::static_capture_prefix(void* handle, const SerdNode* name, const SerdNode* uri) {
    RDFParser* const self = static_cast<RDFParser*>(handle);
    return self->capture_prefix(name, uri);
}
// C-callback trampoline: recovers the parser instance from the opaque handle
// and forwards the parsed statement to the member function handle_triple.
SerdStatus RDFParser::static_handle_triple(
    void* handle,
    unsigned int flags,
    const SerdNode* graph,
    const SerdNode* subject,
    const SerdNode* predicate,
    const SerdNode* object,
    const SerdNode* datatype,
    const SerdNode* lang) {
    RDFParser* const self = static_cast<RDFParser*>(handle);
    return self->handle_triple(handle, flags, graph, subject, predicate, object, datatype, lang);
}
/**
 * Extracts the number from a label of the form "b<digits>".
 *
 * @param str Label to inspect.
 * @return The non-negative number following the leading 'b', or -1 when str is
 *         not a 'b' followed only by decimal digits, or when the number does
 *         not fit in an int.
 */
int RDFParser::extractNumber(const std::string& str) {
    // Needs the 'b' marker plus at least one digit.
    if (str.size() < 2 || str[0] != 'b') {
        return -1;
    }

    const int limit = std::numeric_limits<int>::max();
    int value = 0;
    for (size_t i = 1; i < str.size(); ++i) {
        const char c = str[i];
        // Plain range check: safe for negative char values (std::isdigit is not).
        if (c < '0' || c > '9') {
            return -1;
        }
        const int digit = c - '0';
        // Reject instead of overflowing (std::stoi would throw std::out_of_range).
        if (value > (limit - digit) / 10) {
            return -1;
        }
        value = value * 10 + digit;
    }
    return value;
}
// Function to find the highest blank node id number from NTriple vectors
int RDFParser::getHighestBlankNodeID(const std::vector<NTriple>& triples) {
int maxNumber = -1;
for (const auto& triple : triples) {
// Extract numbers from subject and object
int numSubject = extractNumber(triple.subject);
int numObject = extractNumber(triple.object);
maxNumber = std::max({maxNumber, numSubject, numObject});
}
if (maxNumber == -1) {
return 0;
}
return maxNumber;
}
/**
 * Extracts the base URI from a Turtle document.
 *
 * Scans the text line by line for the first line that, after leading spaces
 * and tabs, starts with `@base` and contains a `<...>` pair after it. The text
 * between `<` and the next `>` is returned.
 *
 * @param str The input text to search.
 * @return The extracted base URI, or an empty string if none is found.
 */
std::string RDFParser::extract_base_URI(const std::string& str) {
    static constexpr char kBaseDirective[] = "@base";
    constexpr size_t kDirectiveLength = sizeof(kBaseDirective) - 1;

    std::istringstream stream(str);
    std::string line;
    while (std::getline(stream, line)) {
        // Skip leading indentation; blank lines cannot hold a directive.
        const size_t start = line.find_first_not_of(" \t");
        if (start == std::string::npos) {
            continue;
        }

        // Prefix check at a fixed position instead of searching the whole line.
        if (line.compare(start, kDirectiveLength, kBaseDirective) != 0) {
            continue;
        }

        const size_t begin = line.find('<', start + kDirectiveLength);
        if (begin == std::string::npos) {
            continue;
        }

        // Look for '>' only after '<' so the length below cannot underflow.
        const size_t finish = line.find('>', begin + 1);
        if (finish == std::string::npos) {
            continue;
        }

        return line.substr(begin + 1, finish - begin - 1);
    }
    return "";  // No usable @base directive found
}
/**
 * Error sink for Serd: turns a reported parse error into an exception.
 *
 * @param handle Unused opaque handle passed through by the trampoline.
 * @param error  Error description supplied by Serd.
 * @return Never returns normally.
 * @throws std::runtime_error always, carrying the status text and the
 *         line/column reported by Serd.
 */
SerdStatus RDFParser::handle_error(void* handle, const SerdError* error) {
    (void)handle;

    // Build the message with std::string; adding the status enum to the string
    // literal would be pointer arithmetic and cut the front off the message.
    std::string error_message = "Runtime error occurred. ";
    error_message += reinterpret_cast<const char*>(serd_strerror(error->status));
    error_message += " (line " + std::to_string(error->line) +
                     ", column " + std::to_string(error->col) + ")";

    // NOTE(review): this exception unwinds through Serd's C frames when raised
    // from the reader callback; confirm the build supports that.
    throw std::runtime_error(error_message);
}
/**
 * Prefix sink: registers a namespace prefix in the Serd environment.
 *
 * @param name Prefix name node.
 * @param uri  Namespace URI node bound to the prefix.
 * @return The status reported by serd_env_set_prefix.
 */
SerdStatus RDFParser::capture_prefix(const SerdNode* name, const SerdNode* uri) {
    const SerdStatus status = serd_env_set_prefix(env, name, uri);
    return status;
}
/**
 * Expands a node (e.g. a CURIE) to a full URI using the Serd environment.
 *
 * @param node Node to expand.
 * @return The expanded node when serd_env_expand_node yields one (non-null
 *         buf); otherwise a copy of *node.
 */
SerdNode RDFParser::expand_node(const SerdNode* node) {
    const SerdNode expanded = serd_env_expand_node(env, node);
    // A null buffer signals that no expansion was produced.
    return expanded.buf ? expanded : *node;
}
/**
 * Statement sink: stores one parsed statement as an NTriple in rml_triples.
 *
 * Subject, predicate and object are each expanded through the Serd
 * environment; when a node cannot be expanded its original text is used.
 *
 * @param handle    Unused.
 * @param flags     Unused.
 * @param graph     Unused.
 * @param subject   Subject node of the statement.
 * @param predicate Predicate node of the statement.
 * @param object    Object node of the statement.
 * @param datatype  Unused.
 * @param lang      Unused.
 * @return SERD_SUCCESS.
 */
SerdStatus RDFParser::handle_triple(
    void* handle,
    unsigned int flags,
    const SerdNode* graph,
    const SerdNode* subject,
    const SerdNode* predicate,
    const SerdNode* object,
    const SerdNode* datatype,
    const SerdNode* lang) {
    // Unused parameters
    (void)handle;
    (void)flags;
    (void)graph;
    (void)datatype;
    (void)lang;

    // Returns the expanded text of a node as an owned string. The node from
    // serd_env_expand_node is heap-allocated, so it is freed once copied;
    // the node passed in by the reader is never freed here.
    const auto node_text = [this](const SerdNode* node) -> std::string {
        SerdNode expanded = serd_env_expand_node(env, node);
        const SerdNode& source = expanded.buf ? expanded : *node;
        // Guard against a null buffer: std::string(nullptr) is undefined.
        std::string text = source.buf ? reinterpret_cast<const char*>(source.buf) : "";
        serd_node_free(&expanded);  // no-op when nothing was allocated
        return text;
    };

    NTriple triple;
    triple.subject = node_text(subject);
    triple.predicate = node_text(predicate);
    triple.object = node_text(object);

    rml_triples.push_back(std::move(triple));
    return SERD_SUCCESS;
}
void RDFParser::handle_rdf_parsing(const std::string& rdf_data) {
std::vector<NTriple> rml_triples;
// Parse RML Rule
std::string base_uri = extract_base_URI(rdf_data);
this->extracted_base_uri = base_uri;
// Create a SerdNode for the base_uri string
SerdNode baseNode = serd_node_new_uri_from_string((const uint8_t*)base_uri.c_str(), NULL, NULL);
// Set the base URI for the environment
serd_env_set_base_uri(env, &baseNode);
//// Setup serd reader ////
SerdReader* reader = serd_reader_new(
SERD_TURTLE, // Reading Turtle RDF
this, // Handle for your user data
nullptr, // Free function for user data
nullptr, // Base sink
static_capture_prefix, // Prefix sink
static_handle_triple, // Statement sink
nullptr); // End sink
// Set the error handling function for the reader
serd_reader_set_error_sink(reader, static_handle_error, this);
// Parse the data
SerdStatus status = serd_reader_read_string(reader, (const uint8_t*)rdf_data.c_str());
if (status) {
throw std::runtime_error("Runtime error occurred reading RML rule.");
}
// Free memory from Serd
serd_reader_free(reader);
serd_node_free(&baseNode);
}