Skip to content

Commit 9e28247

Browse files
committed
src/node/serializing.rs: implement format_text
1 parent b598faf commit 9e28247

File tree

9 files changed

+239
-4
lines changed

9 files changed

+239
-4
lines changed

CHANGELOG.md

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ All notable changes to the `dom_query` crate will be documented in this file.
77

88
### Added
99
- Implemented `NodeRef::normalized_char_count` which estimates the number of characters in the text of descendant nodes as if the total string were normalized.
10-
10+
- Implemented `Document::formatted_text`, `Selection::formatted_text`, and `NodeRef::formatted_text`, which return formatted text of the document, selection, or node respectively.
1111

1212
## [0.12.0] - 2025-01-16
1313

src/document.rs

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -117,6 +117,17 @@ impl Document {
117117
self.root().text()
118118
}
119119

120+
/// Returns the formatted text of the document and its descendants. This is the same as
121+
/// the `text()` method, but with a few differences:
122+
///
123+
/// - Whitespace is normalized so that there is only one space between words.
124+
/// - All whitespace is removed from the beginning and end of the text.
125+
/// - After block elements, a double newline is added.
126+
/// - For elements like `br`, `hr`, `li`, `tr` a single newline is added.
127+
pub fn formatted_text(&self) -> StrTendril {
128+
self.root().formatted_text()
129+
}
130+
120131
/// Finds the base URI of the tree by looking for `<base>` tags in document's head.
121132
///
122133
/// The base URI is the value of the `href` attribute of the first

src/node.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ pub use iters::{
1515
};
1616
pub use node_data::{Element, NodeData};
1717
pub use node_ref::{Node, NodeRef};
18+
pub(crate) use serializing::format_text;
1819
pub use serializing::SerializableNodeRef;
1920

2021
/// Represents a Node ID.

src/node/node_ref.rs

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ use super::child_nodes;
2121
use super::id_provider::NodeIdProver;
2222
use super::inner::TreeNode;
2323
use super::node_data::NodeData;
24+
use super::serializing::format_text;
2425
use super::serializing::SerializableNodeRef;
2526
use super::NodeId;
2627

@@ -552,6 +553,18 @@ impl NodeRef<'_> {
552553
TreeNodeOps::immediate_text_of(nodes, self.id)
553554
}
554555

556+
/// Returns the formatted text of the node and its descendants. This is the same as
557+
/// the `text()` method, but with a few differences:
558+
///
559+
/// - Whitespace is normalized so that there is only one space between words.
560+
/// - All whitespace is removed from the beginning and end of the text.
561+
/// - After block elements, a double newline is added.
562+
/// - For elements like `br`, `hr`, `li`, `tr` a single newline is added.
563+
564+
pub fn formatted_text(&self) -> StrTendril {
565+
// `false`: start from this node's children and trim trailing whitespace from the result.
format_text(self, false)
566+
}
567+
555568
/// Checks if the node contains the specified text
556569
pub fn has_text(&self, needle: &str) -> bool {
557570
let mut ops = vec![self.id];

src/node/serializing.rs

Lines changed: 124 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,11 @@ use std::io;
33

44
use html5ever::serialize::TraversalScope;
55
use html5ever::serialize::{Serialize, Serializer};
6-
use html5ever::QualName;
6+
7+
use html5ever::{local_name, QualName};
8+
use tendril::StrTendril;
9+
10+
use crate::TreeNodeOps;
711

812
use super::node_data::NodeData;
913
use super::node_ref::NodeRef;
@@ -82,3 +86,122 @@ impl Serialize for SerializableNodeRef<'_> {
8286
Ok(())
8387
}
8488
}
89+
90+
pub(crate) fn format_text(root_node: &NodeRef, include_node: bool) -> StrTendril {
91+
let id = root_node.id;
92+
let nodes = root_node.tree.nodes.borrow();
93+
let mut ops = if include_node {
94+
vec![SerializeOp::Open(id)]
95+
} else {
96+
child_nodes(Ref::clone(&nodes), &id, true)
97+
.map(SerializeOp::Open)
98+
.collect()
99+
};
100+
101+
let mut text = StrTendril::new();
102+
103+
while let Some(op) = ops.pop() {
104+
match op {
105+
SerializeOp::Open(id) => {
106+
let node = match nodes.get(id.value) {
107+
Some(node) => node,
108+
None => continue,
109+
};
110+
111+
match node.data {
112+
NodeData::Text { ref contents } => {
113+
if contents.is_empty() {
114+
continue;
115+
}
116+
let follows_newline = text.ends_with('\n') || text.is_empty();
117+
let normalized = normalize_text(contents.as_ref(), follows_newline);
118+
text.push_tendril(&normalized);
119+
}
120+
NodeData::Element(ref e) => {
121+
ops.push(SerializeOp::Close(&e.name));
122+
123+
if matches!(e.name.local, local_name!("pre")) {
124+
text.push_tendril(&TreeNodeOps::text_of(Ref::clone(&nodes), id));
125+
continue;
126+
}
127+
128+
ops.extend(
129+
child_nodes(Ref::clone(&nodes), &id, true).map(SerializeOp::Open),
130+
);
131+
}
132+
NodeData::Document | NodeData::Fragment => {
133+
// Push children in reverse order
134+
ops.extend(
135+
child_nodes(Ref::clone(&nodes), &id, true).map(SerializeOp::Open),
136+
);
137+
continue;
138+
}
139+
_ => {}
140+
}
141+
}
142+
SerializeOp::Close(name) => {
143+
if text.ends_with("\n\n") {
144+
continue;
145+
}
146+
if matches!(
147+
name.local,
148+
local_name!("article")
149+
| local_name!("blockquote")
150+
| local_name!("section")
151+
| local_name!("div")
152+
| local_name!("p")
153+
| local_name!("pre")
154+
| local_name!("h1")
155+
| local_name!("h2")
156+
| local_name!("h3")
157+
| local_name!("h4")
158+
| local_name!("h5")
159+
| local_name!("h6")
160+
| local_name!("ul")
161+
| local_name!("ol")
162+
| local_name!("dl")
163+
| local_name!("table")
164+
) {
165+
text.push_slice("\n\n");
166+
} else if matches!(
167+
name.local,
168+
local_name!("br") | local_name!("hr") | local_name!("li") | local_name!("tr")
169+
) {
170+
text.push_char('\n');
171+
}
172+
}
173+
}
174+
}
175+
if !include_node {
176+
while !text.is_empty() && text.ends_with(char::is_whitespace) {
177+
text.pop_back(1);
178+
}
179+
}
180+
text
181+
}
182+
183+
fn normalize_text(text: &str, follows_newline: bool) -> StrTendril {
184+
let push_start_whitespace = !follows_newline && text.starts_with(char::is_whitespace);
185+
let push_end_whitespace = text.ends_with(char::is_whitespace);
186+
187+
let mut result = StrTendril::with_capacity(text.len() as u32);
188+
let mut iter = text.split_whitespace();
189+
190+
if let Some(first) = iter.next() {
191+
if push_start_whitespace {
192+
result.push_char(' ');
193+
}
194+
result.push_slice(first);
195+
for word in iter {
196+
result.push_char(' ');
197+
result.push_slice(word);
198+
}
199+
}
200+
if result.is_empty() {
201+
return result;
202+
}
203+
if push_end_whitespace {
204+
result.push_char(' ');
205+
}
206+
result
207+
}

src/selection.rs

Lines changed: 16 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ use tendril::StrTendril;
77

88
use crate::document::Document;
99
use crate::matcher::{MatchScope, Matcher, Matches};
10-
use crate::node::{ancestor_nodes, child_nodes, NodeId, NodeRef, TreeNode};
10+
use crate::node::{ancestor_nodes, child_nodes, format_text, NodeId, NodeRef, TreeNode};
1111
use crate::{Tree, TreeNodeOps};
1212

1313
/// Selection represents a collection of nodes matching some criteria. The
@@ -176,6 +176,21 @@ impl Selection<'_> {
176176
pub fn immediate_text(&self) -> StrTendril {
177177
self.text_fn(TreeNodeOps::immediate_text_of)
178178
}
179+
180+
/// Returns the formatted text of the selected nodes and their descendants.
181+
/// This is the same as the `text()` method, but with a few differences:
182+
///
183+
/// - Whitespace is normalized so that there is only one space between words.
184+
/// - Unlike `NodeRef::formatted_text`, trailing whitespace is not trimmed: every selected node is formatted together with the line breaks that follow it, and the results are concatenated.
185+
/// - After block elements, a double newline is added.
186+
/// - For elements like `br`, `hr`, `li`, `tr` a single newline is added.
187+
pub fn formatted_text(&self) -> StrTendril {
188+
let mut s = StrTendril::new();
189+
for node in self.nodes() {
190+
// `true`: the selected node itself is formatted, not only its children.
s.push_tendril(&format_text(node, true));
191+
}
192+
s
193+
}
179194
}
180195

181196
//matching methods

tests/data.rs

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -104,3 +104,25 @@ pub static ATTRS_CONTENTS: &str = r#"<!DOCTYPE html>
104104
</div>
105105
</body>
106106
</html>"#;
107+
108+
/// HTML fixture: a `div#main` holding two `div`s with four `p` elements each.
/// The second paragraph contains an inline `b` element.
pub static DMC_CONTENTS: &str = r#"<!DOCTYPE html>
109+
<html>
110+
<head></head>
111+
<body>
112+
<div id="main">
113+
<div>
114+
<p>Listen up y'all, it's time to get down</p>
115+
<p>'Bout that <b>normalized_char_count</b> in this town</p>
116+
<p>Traversing nodes with style and grace</p>
117+
<p>Counting chars at a steady pace</p>
118+
</div>
119+
120+
<div>
121+
<p>No split whitespace, that's old school</p>
122+
<p>Direct counting's our golden rule</p>
123+
<p>Skip them nodes that ain't text or element</p>
124+
<p>That's how we keep our code development!</p>
125+
</div>
126+
</div>
127+
</body>
128+
</html>"#;

tests/node-traversal.rs

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
mod data;
22

3-
use data::ANCESTORS_CONTENTS;
3+
use data::{ANCESTORS_CONTENTS, DMC_CONTENTS};
44
use dom_query::{Document, NodeData, Selection};
55

66
#[cfg(target_arch = "wasm32")]
@@ -328,3 +328,26 @@ fn test_node_normalized_char_count() {
328328
let got = main_node.normalized_char_count();
329329
assert_eq!(got, expected);
330330
}
331+
332+
// `Document::formatted_text` must separate the paragraphs of `DMC_CONTENTS` with
// a blank line and leave no whitespace at the start or end of the result.
#[cfg_attr(not(target_arch = "wasm32"), test)]
333+
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
334+
fn test_doc_formatted_text() {
335+
let doc = Document::from(DMC_CONTENTS);
336+
let text = doc.formatted_text();
337+
let expected = r#"Listen up y'all, it's time to get down
338+
339+
'Bout that normalized_char_count in this town
340+
341+
Traversing nodes with style and grace
342+
343+
Counting chars at a steady pace
344+
345+
No split whitespace, that's old school
346+
347+
Direct counting's our golden rule
348+
349+
Skip them nodes that ain't text or element
350+
351+
That's how we keep our code development!"#;
352+
assert_eq!(text.as_ref(), expected);
353+
}

tests/selection-traversal.rs

Lines changed: 27 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ mod data;
22

33
use data::doc;
44
use data::doc_wiki;
5+
use data::DMC_CONTENTS;
56
use data::{ANCESTORS_CONTENTS, LIST_CONTENTS};
67
use dom_query::Document;
78

@@ -522,3 +523,29 @@ fn test_selection_get_node() {
522523
let third = sel.get(2);
523524
assert!(third.is_none());
524525
}
526+
527+
// `Selection::formatted_text` formats every selected `p` including its closing
// blank line, so the expected text ends with a blank line.
#[cfg_attr(not(target_arch = "wasm32"), test)]
528+
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
529+
fn test_selection_formatted_text() {
530+
let doc = Document::from(DMC_CONTENTS);
531+
let sel = doc.select("p");
532+
let text = sel.formatted_text();
533+
let expected = r#"Listen up y'all, it's time to get down
534+
535+
'Bout that normalized_char_count in this town
536+
537+
Traversing nodes with style and grace
538+
539+
Counting chars at a steady pace
540+
541+
No split whitespace, that's old school
542+
543+
Direct counting's our golden rule
544+
545+
Skip them nodes that ain't text or element
546+
547+
That's how we keep our code development!
548+
549+
"#;
550+
assert_eq!(text.as_ref(), expected);
551+
}

0 commit comments

Comments
 (0)