Skip to content

Commit

Permalink
Merge pull request #61 from niklak/feature/node-normalized-char-count
Browse files Browse the repository at this point in the history
- Implemented NodeRef::normalized_char_count which estimates the number of characters in the text of descendant nodes as if the total string were normalized.
  • Loading branch information
niklak authored Feb 1, 2025
2 parents 473fc5a + 83d5a03 commit b598faf
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 1 deletion.
7 changes: 7 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,13 @@

All notable changes to the `dom_query` crate will be documented in this file.


## [Unreleased]

### Added
- Implemented `NodeRef::normalized_char_count`, which estimates the number of characters in the text of descendant nodes as if the total string were normalized.


## [0.12.0] - 2025-01-16

### Added
Expand Down
1 change: 1 addition & 0 deletions src/dom_tree.rs
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
mod helpers;
mod ops;
mod traversal;
mod tree;
Expand Down
18 changes: 18 additions & 0 deletions src/dom_tree/helpers.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,18 @@
/// Counts the characters `text` would have after whitespace normalization.
///
/// Leading and trailing whitespace is ignored and every inner run of
/// whitespace counts as exactly one character. Characters are Unicode scalar
/// values (`char`), not bytes.
///
/// Returns `0` for an empty or whitespace-only string.
pub(crate) fn normalized_char_count(text: &str) -> usize {
    // Charge every word for its own characters plus one separator, then drop
    // the separator charged to the last word. `saturating_sub` keeps the
    // result at zero when there are no words at all.
    text.split_whitespace()
        .map(|word| word.chars().count() + 1)
        .sum::<usize>()
        .saturating_sub(1)
}
50 changes: 50 additions & 0 deletions src/dom_tree/ops.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ use std::cell::Ref;

use tendril::StrTendril;

use super::helpers::normalized_char_count;
use super::Tree;

use crate::entities::{into_tendril, wrap_tendril, StrWrap};
use crate::node::child_nodes;
use crate::node::{NodeData, NodeId, TreeNode};
Expand All @@ -12,6 +14,8 @@ pub struct TreeNodeOps {}
impl TreeNodeOps {
/// Collects all text content of a node and its descendants.
///
/// # Arguments
///
/// - `nodes`: a reference to a vector of `TreeNode`s.
/// - `id`: `NodeId` of the element to get the text content from.
///
Expand Down Expand Up @@ -39,6 +43,52 @@ impl TreeNodeOps {
into_tendril(text)
}

/// Traverses the tree and counts the characters in the text content of a node
/// and its descendants as if that text were whitespace-normalized.
///
/// Normalization here means: leading and trailing whitespace is dropped and
/// every remaining run of whitespace counts as a single character, even when
/// the run spans several adjacent text nodes.
///
/// # Arguments
///
/// - `nodes`: a reference to a vector of `TreeNode`s.
/// - `id`: `NodeId` of the element to count the text content of.
///
/// The intended result is the same as
/// `node.text().split_whitespace().collect::<Vec<_>>().join(" ").chars().count()`,
/// but the count is accumulated one text node at a time, so the text content
/// never has to be collected into a single string.
///
/// # Returns
/// The number of characters (`char`s, not bytes) that would be in the text
/// content if it were normalized. Returns `0` when there is no
/// non-whitespace text.
pub fn normalized_char_count(nodes: Ref<Vec<TreeNode>>, id: NodeId) -> usize {
    let mut ops = vec![id];
    let mut total: usize = 0;
    // True when whitespace has been seen after the last counted character.
    // It only turns into a counted separator once more non-whitespace text
    // follows, so leading and trailing whitespace never contribute.
    let mut pending_space = false;

    while let Some(id) = ops.pop() {
        if let Some(node) = nodes.get(id.value) {
            match node.data {
                NodeData::Document | NodeData::Fragment | NodeData::Element(_) => {
                    // NOTE(review): the `true` flag presumably yields the children in
                    // reverse so that popping visits them in document order; the
                    // separator logic below relies on that - confirm in `child_nodes`.
                    ops.extend(child_nodes(Ref::clone(&nodes), &id, true));
                }
                NodeData::Text { ref contents } => {
                    let count = normalized_char_count(contents);
                    if count == 0 {
                        // Empty or whitespace-only node: it contributes no characters
                        // of its own, but a whitespace-only node still separates the
                        // text before it from the text after it.
                        pending_space |= contents.ends_with(char::is_whitespace);
                        continue;
                    }
                    // One separator between this node's text and what was already
                    // counted, if any whitespace lies between them. The helper has
                    // already ignored this node's own leading/trailing whitespace.
                    if total > 0
                        && (pending_space || contents.starts_with(char::is_whitespace))
                    {
                        total += 1;
                    }
                    total += count;
                    pending_space = contents.ends_with(char::is_whitespace);
                }

                _ => continue,
            }
        }
    }
    total
}

/// Returns the text of the node without its descendants.
pub fn immediate_text_of(nodes: Ref<Vec<TreeNode>>, id: NodeId) -> StrTendril {
let mut text = StrWrap::new();
Expand Down
17 changes: 17 additions & 0 deletions src/node/node_ref.rs
Original file line number Diff line number Diff line change
Expand Up @@ -677,4 +677,21 @@ impl NodeRef<'_> {
.map(|node_id| NodeRef::new(*node_id, self.tree))
.collect()
}

/// Estimates the number of characters in the text content of this node and
/// its descendants as if that text were whitespace-normalized.
///
/// Normalization means treating any sequence of whitespace characters as a
/// single space. The counting itself is delegated to
/// `TreeNodeOps::normalized_char_count`, which walks the tree rather than
/// collecting the text content into a string first.
///
/// # Returns
/// The number of characters that would be in the text content if it were normalized.
pub fn normalized_char_count(&self) -> usize {
    // NOTE(review): `borrow()` presumably panics if the tree's nodes are
    // mutably borrowed at this point - confirm against the `Tree` definition.
    TreeNodeOps::normalized_char_count(self.tree.nodes.borrow(), self.id)
}
}
30 changes: 29 additions & 1 deletion tests/node-traversal.rs
Original file line number Diff line number Diff line change
Expand Up @@ -299,4 +299,32 @@ fn test_node_find() {
assert_eq!(len_fin_ne, 0);
let len_sel_ne = doc.select("body td p").length();
assert_eq!(len_sel_ne, 0)
}
}

// Checks that `NodeRef::normalized_char_count` agrees with normalizing the
// collected text by hand (split on whitespace, join with single spaces).
#[cfg_attr(not(target_arch = "wasm32"), test)]
#[cfg_attr(target_arch = "wasm32", wasm_bindgen_test)]
fn test_node_normalized_char_count() {
    let contents: &str = r#"
<div id="main">
A very
messy content
<span>. A something that</span>
<p>
asks to be normalized </p>
</div>
"#;

    let doc = Document::from(contents);
    let main_sel = doc.select_single("#main");
    let main_node = main_sel.nodes().first().unwrap();
    // Count `char`s rather than bytes: the function under test counts
    // characters, and `len()` would only agree for ASCII-only fixtures.
    let expected = main_node
        .text()
        .split_whitespace()
        .collect::<Vec<&str>>()
        .join(" ")
        .chars()
        .count();
    let got = main_node.normalized_char_count();
    assert_eq!(got, expected);
}

0 comments on commit b598faf

Please sign in to comment.