Skip to content

Commit

Permalink
Fix html content not implemented
Browse files Browse the repository at this point in the history
  • Loading branch information
nbittich committed Dec 10, 2024
1 parent 96071f4 commit 6c6a340
Show file tree
Hide file tree
Showing 14 changed files with 206 additions and 27 deletions.
21 changes: 21 additions & 0 deletions .nvim-dap.lua
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
-- Project-local nvim-dap settings for debugging the Rust crates in this
-- repository through an LLDB debug adapter.
local dap = require("dap")

-- Adapter entry stored under the key `lldb`; the Rust configuration below
-- selects it through its `type = "lldb"` field.
-- NOTE(review): the command is an absolute, version-pinned path
-- (lldb-vscode-14) — confirm it exists on the machine running this config.
dap.adapters.lldb = {
  type = "executable",
  command = "/usr/bin/lldb-vscode-14",
  name = "lldb",
}

--- Ask the user which compiled test binary should be debugged.
-- The prompt is pre-filled with `<cwd>/target/debug/` and uses file
-- completion.
-- @treturn string whatever the user entered at the prompt
local function prompt_for_test_binary()
  local default_path = vim.fn.getcwd() .. "/target/debug/"
  return vim.fn.input("Path to test binary: ", default_path, "file")
end

-- Launch configurations for the `rust` filetype key.
dap.configurations.rust = {
  {
    name = "lib-rdfa",
    type = "lldb",
    request = "launch",
    -- Passed as a function so the path is asked for when the function is
    -- invoked rather than when this file is loaded.
    program = prompt_for_test_binary,
    -- `${workspaceFolder}` is left as a literal placeholder here; presumably
    -- nvim-dap expands it when the session starts — TODO confirm.
    cwd = "${workspaceFolder}/lib-rdfa",
    stopOnEntry = false,
  },
}
17 changes: 17 additions & 0 deletions lib-rdfa/examples/other/example0006.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>HTMLLiteral RDFa Example</title>
</head>
<body vocab="http://schema.org/" typeof="CreativeWork">
<div
typeof="schema:Text"
property="schema:description"
datatype="http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML"
>
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
</div>
</body>
</html>
7 changes: 7 additions & 0 deletions lib-rdfa/examples/other/example0006.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
<http://rdfa.info/test-suite/test-cases/rdfa1.1/html5/> <http://www.w3.org/ns/rdfa#usesVocabulary> <http://schema.org/>;
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/CreativeWork> .
_:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Text>;
<http://schema.org/description> """
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
"""^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML> .
17 changes: 17 additions & 0 deletions lib-rdfa/examples/other/example0007.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>HTMLLiteral RDFa Example</title>
</head>
<body vocab="http://schema.org/" typeof="CreativeWork">
<div
typeof="schema:Text"
property="schema:description"
datatype="http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"
>
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions lib-rdfa/examples/other/example0007.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

<http://rdfa.info/test-suite/test-cases/rdfa1.1/html5/> <http://www.w3.org/ns/rdfa#usesVocabulary> <http://schema.org/>;
<http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/CreativeWork> .
_:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Text>;
<http://schema.org/description> """
This is a bold statement within an HTMLLiteral!
RDFa supports rich content extraction.
"""@en .
17 changes: 17 additions & 0 deletions lib-rdfa/examples/other/example0008.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>HTMLLiteral RDFa Example</title>
</head>
<body vocab="http://schema.org/" typeof="CreativeWork" href="http://x.com/z">
<div
typeof="schema:Text"
property="schema:description"
datatype="http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral"
>
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions lib-rdfa/examples/other/example0008.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

<http://rdfa.info/test-suite/test-cases/rdfa1.1/html5/> <http://www.w3.org/ns/rdfa#usesVocabulary> <http://schema.org/> .
<http://x.com/z> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/CreativeWork> .
_:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Text>;
<http://schema.org/description> """
This is a bold statement within an HTMLLiteral!
RDFa supports rich content extraction.
"""@en .
17 changes: 17 additions & 0 deletions lib-rdfa/examples/other/example0009.html
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
<!doctype html>
<html lang="en">
<head>
<meta charset="UTF-8" />
<title>HTMLLiteral RDFa Example</title>
</head>
<body vocab="http://schema.org/" typeof="CreativeWork" href="http://x.com/z">
<div
typeof="schema:Text"
property="schema:description"
datatype="http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML"
>
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
</div>
</body>
</html>
8 changes: 8 additions & 0 deletions lib-rdfa/examples/other/example0009.ttl
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@

<http://rdfa.info/test-suite/test-cases/rdfa1.1/html5/> <http://www.w3.org/ns/rdfa#usesVocabulary> <http://schema.org/> .
<http://x.com/z> <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/CreativeWork> .
_:1 <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> <http://schema.org/Text>;
<http://schema.org/description> """
<p>This is a <strong>bold</strong> statement within an HTMLLiteral!</p>
<p>RDFa supports <em>rich content</em> extraction.</p>
"""^^<http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML> .
4 changes: 4 additions & 0 deletions lib-rdfa/src/constants.rs
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ pub static RDFA_COPY_PREDICATE: &str = "http://www.w3.org/ns/rdfa#copy";
pub static RDFA_PATTERN_TYPE: &str = "http://www.w3.org/ns/rdfa#Pattern";
pub static RDFA_USES_VOCABULARY: &str = "http://www.w3.org/ns/rdfa#usesVocabulary";
pub static RDF_XML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#XMLLiteral";
pub static RDF_HTML_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#HTML";
pub static RDF_PLAIN_LITERAL: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#PlainLiteral";
pub static RDF_XSD_STRING: &str = "http://www.w3.org/2001/XMLSchema#string";
pub static NS_TYPE: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type";
pub static RDF_FIRST: &str = "http://www.w3.org/1999/02/22-rdf-syntax-ns#first";
Expand Down Expand Up @@ -68,6 +70,8 @@ pub static DATETIME_TYPES: [&DataTypeFromPattern; 6] = [

lazy_static::lazy_static! {
pub static ref NODE_RDF_XML_LITERAL: Node<'static> = Node::Iri(Cow::Borrowed(RDF_XML_LITERAL));
pub static ref NODE_RDF_PLAIN_LITERAL: Node<'static> = Node::Iri(Cow::Borrowed(RDF_PLAIN_LITERAL));
pub static ref NODE_RDF_HTML_LITERAL: Node<'static> = Node::Iri(Cow::Borrowed(RDF_HTML_LITERAL));
pub static ref NODE_RDF_FIRST: Node<'static> = Node::Iri(Cow::Borrowed(RDF_FIRST));
pub static ref NODE_RDF_REST: Node<'static> = Node::Iri(Cow::Borrowed(RDF_REST));
pub static ref NODE_RDF_NIL: Node<'static> = Node::Iri(Cow::Borrowed(RDF_NIL));
Expand Down
103 changes: 77 additions & 26 deletions lib-rdfa/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ mod tests;

use constants::{
get_uuid, COMMON_PREFIXES, NODE_NS_TYPE, NODE_RDFA_PATTERN_TYPE, NODE_RDFA_USES_VOCABULARY,
RESERVED_KEYWORDS,
NODE_RDF_HTML_LITERAL, NODE_RDF_PLAIN_LITERAL, RESERVED_KEYWORDS,
};
use itertools::Itertools;
use log::{debug, error};
Expand Down Expand Up @@ -125,7 +125,10 @@ fn traverse_element<'a, 'b>(
.filter(|r| !is_empty_curie(r))
.map(|c| if c.is_empty() { ctx.base } else { c });

ctx.lang = elt.lang.or_else(|| parent.and_then(|p| p.lang));
ctx.lang = elt
.lang
.or_else(|| parent.and_then(|p| p.lang))
.or(ctx.lang);

let mut about = elt.about.and_then(|a| resolve_uri(a, &ctx, true).ok());

Expand All @@ -151,6 +154,25 @@ fn traverse_element<'a, 'b>(
}
});

let datatype = elt
.datatype
.and_then(|dt| match resolve_uri(dt, &ctx, false) {
Ok(d) => Some(Box::new(d)),
Err(e) => {
debug!("could not parse {dt}. error {e}");
None
}
});
let is_special_node = |datatype: &Option<Box<Node<'_>>>| {
datatype
.as_ref()
.filter(|dt| {
dt.as_ref() == &*NODE_RDF_HTML_LITERAL
|| dt.as_ref() == &*NODE_RDF_XML_LITERAL
|| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL
})
.is_some()
};
let predicates = elt
.property
.map(|p| parse_property_or_type_of(p, &ctx, false));
Expand All @@ -168,8 +190,13 @@ fn traverse_element<'a, 'b>(
.ok_or("no parent")
};

// by default, current node set as the base
let mut current_node = base.clone();
// by default, current node set as the base unless it's a special node
// check other/example0006 for special node
let mut current_node = if !is_special_node(&datatype) {
base.clone()
} else {
make_bnode()
};

// if parent is inlist
if let Some(parent_in_list) = parent_in_list.take() {
Expand All @@ -181,7 +208,7 @@ fn traverse_element<'a, 'b>(
{
resource
} else {
Node::Ref(Arc::new(extract_literal(&elt, &ctx)?))
Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
};
for rel in parent_in_list {
push_triples_inlist(in_list_stmts, &subject, rel, &obj);
Expand Down Expand Up @@ -216,7 +243,7 @@ fn traverse_element<'a, 'b>(
{
resource
} else {
Node::Ref(Arc::new(extract_literal(&elt, &ctx)?))
Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
};
for rel in rels {
push_triples_inlist(in_list_stmts, &subject, rel, &obj);
Expand All @@ -226,7 +253,7 @@ fn traverse_element<'a, 'b>(
let obj = if let (Some(resource), false) = (resource, in_rel) {
Node::Ref(Arc::new(resolve_uri(resource, &ctx, true)?))
} else {
Node::Ref(Arc::new(extract_literal(&elt, &ctx)?))
Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?))
};

for predicate in predicates {
Expand Down Expand Up @@ -276,7 +303,7 @@ fn traverse_element<'a, 'b>(
stmts,
&current_node,
&predicates,
&Node::Ref(Arc::new(extract_literal(&elt, &ctx)?)),
&Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
);

if let Some(src_or_href) = src_or_href.take() {
Expand All @@ -294,10 +321,10 @@ fn traverse_element<'a, 'b>(
stmts,
&current_node,
&predicates,
&extract_literal(&elt, &ctx)?,
&extract_literal(&elt, &datatype, &ctx)?,
);
}
// test 0303, this becomes dumber and dumber
// test 0303
else if src_or_href.is_some() && (rels.is_some() || revs.is_some()) {
let src_or_href = src_or_href.take().ok_or("no src")?;
current_node = get_parent_subject(&ctx).ok().unwrap_or_else(make_bnode);
Expand Down Expand Up @@ -345,7 +372,7 @@ fn traverse_element<'a, 'b>(
stmts,
&current_node,
&predicates,
&extract_literal(&elt, &ctx)?,
&extract_literal(&elt, &datatype, &ctx)?,
);
}
}
Expand All @@ -371,9 +398,19 @@ fn traverse_element<'a, 'b>(
})
}
push_triples(stmts, &base, &rels.take(), &current_node);
} else {
} else if !is_special_node(&datatype) {
let child_with_rdfa_tag = element_ref
.select(&Selector::parse("[href], [src], [resource], [property]")?)
.filter(|e| {
RdfaElement::new(e)
.ok()
.and_then(|e2| e2.datatype)
.and_then(|dt| match resolve_uri(dt, &ctx, false).ok().map(Box::new) {
v @ Some(_) if is_special_node(&v) => v,
_ => None,
})
.is_none()
})
.count()
== 0;
current_node = if let Some(src_or_href) = src_or_href.take() {
Expand All @@ -387,6 +424,14 @@ fn traverse_element<'a, 'b>(
let subject = get_parent_subject(&ctx).ok().unwrap_or_else(make_bnode);

push_triples(stmts, &subject, &predicates, &current_node);
} else {
// test examples/other/example0006.html
push_triples(
stmts,
&current_node,
&predicates,
&extract_literal(&elt, &datatype, &ctx)?,
);
}
}
// another general case
Expand All @@ -401,7 +446,7 @@ fn traverse_element<'a, 'b>(
stmts,
&current_node,
&predicates,
&Node::Ref(Arc::new(extract_literal(&elt, &ctx)?)),
&Node::Ref(Arc::new(extract_literal(&elt, &datatype, &ctx)?)),
);
}

Expand Down Expand Up @@ -452,6 +497,7 @@ fn traverse_element<'a, 'b>(
}
let child_ctx = Context {
base: ctx.base,
lang: ctx.lang,
empty_ref_node_substitute: ctx.empty_ref_node_substitute,
..Default::default()
};
Expand All @@ -467,47 +513,52 @@ fn traverse_element<'a, 'b>(
}
fn extract_literal<'a>(
rdfa_el: &RdfaElement<'a, '_>,
datatype: &Option<Box<Node<'a>>>,
ctx: &Context<'a>,
) -> Result<Node<'a>, &'static str> {
let datatype = rdfa_el
.datatype
.and_then(|dt| match resolve_uri(dt, ctx, false) {
Ok(d) => Some(Box::new(d)),
Err(e) => {
debug!("could not parse {dt}. error {e}");
None
}
});
let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
let plain_datatype = datatype
.as_ref()
.filter(|dt| dt.as_ref() == &*NODE_RDF_PLAIN_LITERAL)
.is_some();

let lang = ctx.lang.filter(|s| datatype.is_none() && !s.is_empty());
if let Some(value) = rdfa_el.src_or_href().filter(|_| {
!rdfa_el.has_about() && !rdfa_el.has_property() || !rdfa_el.has_content_or_datatype()
}) {
resolve_uri(value, ctx, true)
} else if let Some(content) = rdfa_el.content {
Ok(Node::Literal(Literal {
datatype,
datatype: datatype.clone(),
value: Cow::Borrowed(content),
lang,
}))
} else if datatype
.as_ref()
.filter(|dt| dt.as_ref() == &*NODE_RDF_XML_LITERAL)
.filter(|dt| {
dt.as_ref() == &*NODE_RDF_XML_LITERAL || dt.as_ref() == &*NODE_RDF_HTML_LITERAL
})
.is_some()
{
Ok(Node::Literal(Literal {
value: Cow::Owned(rdfa_el.element_ref.inner_html()),
datatype,
datatype: datatype.clone(),
lang: None,
}))
} else if let Some(content) = rdfa_el.get_time() {
Ok(Node::Literal(Literal {
datatype: datatype
.clone()
.or_else(|| DataTypeFromPattern::date_time_from_pattern(content).map(Box::new)),
value: Cow::Borrowed(content),
lang: None,
}))
} else {
let datatype = if plain_datatype {
None
} else {
datatype.clone()
};
let lang = if plain_datatype { ctx.lang } else { lang };
let texts = rdfa_el.texts();
let text = if texts.is_empty() {
Cow::Borrowed("")
Expand Down
1 change: 1 addition & 0 deletions lib-rdfa/src/rdfa_elt.rs
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ impl<'a, 'b> RdfaElement<'a, 'b> {
let prefix = element.attr("prefix");
let resource = element.attr("resource");
let lang = element.attr("lang").or_else(|| element.attr("xml:lang"));
println!("{lang:?}");
let property = element.attr("property");
let rel = element.attr("rel");
let rev = element.attr("rev");
Expand Down
Loading

0 comments on commit 6c6a340

Please sign in to comment.