/// Returns the number of `char`s `text` would contain after whitespace
/// normalization.
///
/// Normalization drops leading and trailing whitespace and treats every
/// remaining run of whitespace as a single character, so the result is the
/// total length of the whitespace-separated words plus one separator between
/// each pair of neighbouring words. Empty and whitespace-only input yields `0`.
pub(crate) fn normalized_char_count(text: &str) -> usize {
    // Single pass: accumulate the characters inside words and the word count.
    let (word_chars, word_total) = text
        .split_whitespace()
        .fold((0usize, 0usize), |(chars, words), word| {
            (chars + word.chars().count(), words + 1)
        });

    // One separator between adjacent words; none when there are no words.
    word_chars + word_total.saturating_sub(1)
}
/// @@ -39,6 +43,52 @@ impl TreeNodeOps { into_tendril(text) } + /// Traverses the tree and counts all text content of a node and its descendants, + /// but only counting each sequence of whitespace as a single character. + /// + /// # Arguments + /// + /// - `nodes`: a reference to a vector of `TreeNode`s. + /// - `id`: `NodeId` of the element to get the text content from. + /// + /// This function will traverse the tree and count all text content + /// from the node and its descendants. + /// + /// It has an advantage over `node.text().split_whitespace().count()` + /// because it doesn't need to collect and consume the text content. + /// + /// # Returns + /// The number of characters that would be in the text content if it were normalized, + /// where normalization means treating any sequence of whitespace characters as a single space. + pub fn normalized_char_count(nodes: Ref>, id: NodeId) -> usize { + let mut ops = vec![id]; + let mut c: usize = 0; + let mut last_was_whitespace = false; + + while let Some(id) = ops.pop() { + if let Some(node) = nodes.get(id.value) { + match node.data { + NodeData::Document | NodeData::Fragment | NodeData::Element(_) => { + ops.extend(child_nodes(Ref::clone(&nodes), &id, true)); + } + NodeData::Text { ref contents } => { + if last_was_whitespace { + c += 1; + } + c += normalized_char_count(contents); + last_was_whitespace = contents.ends_with(char::is_whitespace); + } + + _ => continue, + } + } + } + if last_was_whitespace && c > 0 { + c -= 1; + } + c + } + /// Returns the text of the node without its descendants. 
pub fn immediate_text_of(nodes: Ref>, id: NodeId) -> StrTendril { let mut text = StrWrap::new(); diff --git a/src/node/node_ref.rs b/src/node/node_ref.rs index 30f48fa..c73f3fb 100644 --- a/src/node/node_ref.rs +++ b/src/node/node_ref.rs @@ -677,4 +677,21 @@ impl NodeRef<'_> { .map(|node_id| NodeRef::new(*node_id, self.tree)) .collect() } + + /// Traverses the tree and counts all text content of a node and its descendants, + /// but only counting each sequence of whitespace as a single character. + /// + /// This function will traverse the tree and count all text content + /// from the node and its descendants. + /// + /// It has an advantage over `node.text().split_whitespace().count()` + /// because it doesn't need to collect and consume the text content. + /// + /// # Returns + /// The number of characters that would be in the text content if it were normalized, + /// where normalization means treating any sequence of whitespace characters as a single space. + pub fn normalized_char_count(&self) -> usize { + let nodes = self.tree.nodes.borrow(); + TreeNodeOps::normalized_char_count(nodes, self.id) + } } diff --git a/tests/node-traversal.rs b/tests/node-traversal.rs index 4556440..0d03f09 100644 --- a/tests/node-traversal.rs +++ b/tests/node-traversal.rs @@ -299,4 +299,32 @@ fn test_node_find() { assert_eq!(len_fin_ne, 0); let len_sel_ne = doc.select("body td p").length(); assert_eq!(len_sel_ne, 0) -} \ No newline at end of file +} + +#[cfg_attr(not(target_arch = "wasm32"), test)] +#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)] +fn test_node_normalized_char_count() { + let contents: &str = r#" +
+ A very + messy content + . A something that +

+ asks to be normalized

+ + +
+ "#; + + let doc = Document::from(contents); + let main_sel = doc.select_single("#main"); + let main_node = main_sel.nodes().first().unwrap(); + let expected = main_node + .text() + .split_whitespace() + .collect::>() + .join(" ") + .len(); + let got = main_node.normalized_char_count(); + assert_eq!(got, expected); +}