Merge pull request #61 from niklak/feature/node-normalized-char-count

- Implemented NodeRef::normalized_char_count which estimates the number of characters in the text of descendant nodes as if the total string were normalized.
niklak · Feb 1, 2025 · b598faf · b598faf
2 parents 473fc5a + 83d5a03
commit b598faf
Show file tree

Hide file tree

Showing 6 changed files with 122 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,13 @@
 
 All notable changes to the `dom_query` crate will be documented in this file.
 
+
+## [Unreleased]
+
+### Added
+- Implemented `NodeRef::normalized_char_count` which estimates the number of characters in the text of descendant nodes as if the total string were normalized.
+
+
 ## [0.12.0] - 2025-01-16
 
 ### Added

diff --git a/src/dom_tree.rs b/src/dom_tree.rs
@@ -1,3 +1,4 @@
+mod helpers;
 mod ops;
 mod traversal;
 mod tree;

diff --git a/src/dom_tree/helpers.rs b/src/dom_tree/helpers.rs
@@ -0,0 +1,18 @@
+/// Counts the characters `text` would contain after whitespace normalization.
+///
+/// Normalization means that leading and trailing whitespace is dropped and
+/// every inner run of whitespace counts as a single character.
+/// Empty and whitespace-only input yields `0`.
+pub(crate) fn normalized_char_count(text: &str) -> usize {
+    // Charge every word its own characters plus one separator, then take back
+    // the separator charged for the last word. `saturating_sub` keeps the
+    // result at zero when there are no words at all.
+    text.split_whitespace()
+        .map(|word| word.chars().count() + 1)
+        .sum::<usize>()
+        .saturating_sub(1)
+}
diff --git a/src/dom_tree/ops.rs b/src/dom_tree/ops.rs
@@ -2,7 +2,9 @@ use std::cell::Ref;
 
 use tendril::StrTendril;
 
+use super::helpers::normalized_char_count;
 use super::Tree;
+
 use crate::entities::{into_tendril, wrap_tendril, StrWrap};
 use crate::node::child_nodes;
 use crate::node::{NodeData, NodeId, TreeNode};
@@ -12,6 +14,8 @@ pub struct TreeNodeOps {}
 impl TreeNodeOps {
     /// Collects all text content of a node and its descendants.
     ///
+    /// # Arguments
+    ///
     /// - `nodes`: a reference to a vector of `TreeNode`s.
     /// - `id`: `NodeId` of the element to get the text content from.
     ///
@@ -39,6 +43,52 @@ impl TreeNodeOps {
         into_tendril(text)
     }
 
+    /// Counts the characters in the text content of a node and its descendants
+    /// as if that text had been whitespace-normalized.
+    ///
+    /// Normalization means that leading and trailing whitespace is dropped and
+    /// every run of whitespace, including a run that spans several text nodes,
+    /// counts as a single character.
+    ///
+    /// # Arguments
+    ///
+    /// - `nodes`: a reference to a vector of `TreeNode`s.
+    /// - `id`: `NodeId` of the element to count the text content of.
+    ///
+    /// The result is intended to equal
+    /// `text.split_whitespace().collect::<Vec<_>>().join(" ").chars().count()`
+    /// for the collected text of the node, without building that text.
+    ///
+    /// # Returns
+    /// The number of characters that would be in the text content if it were normalized,
+    /// where normalization means treating any sequence of whitespace characters as a single space.
+    pub fn normalized_char_count(nodes: Ref<Vec<TreeNode>>, id: NodeId) -> usize {
+        let mut ops = vec![id];
+        let mut c: usize = 0;
+        // True when whitespace has been seen since the last counted
+        // non-whitespace character. It only turns into a counted separator
+        // once more non-whitespace text follows, so leading and trailing
+        // whitespace never contributes to the total.
+        let mut pending_whitespace = false;
+
+        while let Some(id) = ops.pop() {
+            if let Some(node) = nodes.get(id.value) {
+                match node.data {
+                    NodeData::Document | NodeData::Fragment | NodeData::Element(_) => {
+                        // NOTE(review): presumably `true` yields the children in
+                        // reverse, so that popping visits them in document order;
+                        // confirm against `child_nodes`.
+                        ops.extend(child_nodes(Ref::clone(&nodes), &id, true));
+                    }
+                    NodeData::Text { ref contents } => {
+                        let count = normalized_char_count(contents);
+                        if count == 0 {
+                            // Empty or whitespace-only node: nothing to count,
+                            // but whitespace still separates its neighbours.
+                            pending_whitespace |= !contents.is_empty();
+                            continue;
+                        }
+                        // The helper trims both ends of `contents`, so the
+                        // boundary with the previously counted text is decided
+                        // here: one separator if whitespace sits on either side.
+                        let starts_with_whitespace = contents.starts_with(char::is_whitespace);
+                        if c > 0 && (pending_whitespace || starts_with_whitespace) {
+                            c += 1;
+                        }
+                        c += count;
+                        pending_whitespace = contents.ends_with(char::is_whitespace);
+                    }
+
+                    _ => continue,
+                }
+            }
+        }
+        c
+    }
+
     /// Returns the text of the node without its descendants.
     pub fn immediate_text_of(nodes: Ref<Vec<TreeNode>>, id: NodeId) -> StrTendril {
         let mut text = StrWrap::new();

diff --git a/src/node/node_ref.rs b/src/node/node_ref.rs
@@ -677,4 +677,21 @@ impl NodeRef<'_> {
             .map(|node_id| NodeRef::new(*node_id, self.tree))
             .collect()
     }
+
+    /// Returns the number of characters in the text content of this node and
+    /// its descendants, counting each sequence of whitespace as one character.
+    ///
+    /// This borrows the tree's node storage and delegates the traversal to
+    /// `TreeNodeOps::normalized_char_count`. The text content is only counted,
+    /// never collected into a string.
+    ///
+    /// # Returns
+    /// The number of characters that would be in the text content if it were normalized,
+    /// where normalization means treating any sequence of whitespace characters as a single space.
+    pub fn normalized_char_count(&self) -> usize {
+        TreeNodeOps::normalized_char_count(self.tree.nodes.borrow(), self.id)
+    }
 }
diff --git a/tests/node-traversal.rs b/tests/node-traversal.rs
@@ -299,4 +299,32 @@ fn test_node_find() {
     assert_eq!(len_fin_ne, 0);
     let len_sel_ne = doc.select("body td p").length();
     assert_eq!(len_sel_ne, 0)
-}
+}
+
+/// Checks `NodeRef::normalized_char_count` against a reference value obtained
+/// by collecting the node's text, normalizing its whitespace and counting the
+/// characters of the result.
+#[cfg_attr(not(target_arch = "wasm32"), test)]
+#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
+fn test_node_normalized_char_count() {
+    let contents: &str = r#"
+        <div id="main">
+        A           very 
+                                messy content
+            <span>. A something       that</span>
+            <p>
+            asks to be     normalized     </p>
+
+
+        </div>
+    "#;
+
+    let doc = Document::from(contents);
+    let main_sel = doc.select_single("#main");
+    let main_node = main_sel.nodes().first().unwrap();
+    // Count `char`s rather than bytes: the function under test counts
+    // characters, and `len()` would only agree with it for ASCII text.
+    let expected = main_node
+        .text()
+        .split_whitespace()
+        .collect::<Vec<&str>>()
+        .join(" ")
+        .chars()
+        .count();
+    let got = main_node.normalized_char_count();
+    assert_eq!(got, expected);
+}