diff --git a/lib/Cargo.toml b/lib/Cargo.toml
index 695d503..141d555 100644
--- a/lib/Cargo.toml
+++ b/lib/Cargo.toml
@@ -8,20 +8,21 @@ repository = { workspace = true }
 readme = { workspace = true }
 keywords = { workspace = true }
 rust-version = { workspace = true }
+
 [dependencies]
-zip = "6.0.0"
+zip = "7.0.0"
 anyhow = "1.0.100"
-quick-xml = { version = "0.38.3" }
+quick-xml = { version = "0.38.4" }
 ab_glyph = { version = "0.2.32", optional = true }
-imageproc = { version = "0.25.0", optional = true}
-serde_json = { version = "1.0.145", optional = true }
+imageproc = { version = "0.26.0", optional = true}
+serde_json = { version = "1.0.149", optional = true }
 iepub-derive = { path = "../derive", version = "1.2.6" }
 serde = { version = "1.0.228", features = ["derive"], optional = true }
-image = { version = "0.25.8", default-features = false, features = ["jpeg"], optional = true }
+image = { version = "0.25.9", default-features = false, features = ["jpeg"], optional = true }
 md-5 = {version = "0.10.6", optional = true }
 
 [dev-dependencies]
-reqwest = { version = "0.11", features = ["blocking"] }
+reqwest = { version = "0.13.1", features = ["blocking"] }
 
 [features]
 no_nav=[]
diff --git a/lib/src/common.rs b/lib/src/common.rs
index 96e910c..b5b4f53 100644
--- a/lib/src/common.rs
+++ b/lib/src/common.rs
@@ -139,6 +139,107 @@ impl From for IError {
         IError::Utf8(value)
     }
 }
+
+/// Kind of node produced when parsing chapter markup.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub enum ContentType {
+    /// Paragraph.
+    Paragraph,
+    /// Heading (level: 1-6).
+    Heading(u8),
+    /// Image.
+    Image,
+    /// Link.
+    Link,
+    /// List item.
+    ListItem,
+    /// Block quote.
+    BlockQuote,
+    /// Code block.
+    CodeBlock,
+    /// Horizontal rule.
+    HorizontalRule,
+    /// Plain text.
+    Text,
+    /// Any other tag, identified by its name.
+    Other(String),
+}
+
+/// A parsed content node.
+#[derive(Debug, Clone, PartialEq, Eq)]
+pub struct ContentItem {
+    /// Kind of this node.
+    pub content_type: ContentType,
+    /// Text content.
+    pub text: String,
+    /// Attributes as `(name, value)` pairs, e.g. an image's `src` or a link's `href`.
+    pub attributes: Vec<(String, String)>,
+    /// Child nodes.
+    pub children: Vec<ContentItem>,
+}
+
+impl ContentItem {
+    /// Creates an empty node of the given type.
+    pub fn new(content_type: ContentType) -> Self {
+        Self {
+            content_type,
+            text: String::new(),
+            attributes: Vec::new(),
+            children: Vec::new(),
+        }
+    }
+
+    /// Appends an attribute.
+    pub fn add_attribute(&mut self, key: String, value: String) {
+        self.attributes.push((key, value));
+    }
+
+    /// Appends a child node.
+    pub fn add_child(&mut self, child: ContentItem) {
+        self.children.push(child);
+    }
+
+    /// Appends `text` to this node's text content.
+    pub fn add_text(&mut self, text: &str) {
+        self.text.push_str(text);
+    }
+
+    /// Renders this node and its descendants as an indented listing, one
+    /// node per line, with `indent` indentation steps before this node and
+    /// one more step per nesting level.
+    pub fn format(&self, indent: usize) -> String {
+        let mut out = String::new();
+        self.format_into(&mut out, indent);
+        out
+    }
+
+    /// Appends the listing for this subtree to `out`, so the whole tree is
+    /// rendered into one buffer instead of one temporary `String` per node.
+    fn format_into(&self, out: &mut String, indent: usize) {
+        out.push_str(&" ".repeat(indent));
+        out.push_str(&format!("[{:?}]", self.content_type));
+
+        if !self.text.is_empty() {
+            out.push_str(&format!(" 文本: \"{}\"", self.text.trim()));
+        }
+
+        if !self.attributes.is_empty() {
+            let attrs: Vec<String> = self
+                .attributes
+                .iter()
+                .map(|(key, value)| format!("{}: \"{}\"", key, value))
+                .collect();
+            out.push_str(&format!(" 属性: {{{}}}", attrs.join(", ")));
+        }
+
+        out.push('\n');
+
+        for child in &self.children {
+            child.format_into(out, indent + 1);
+        }
+    }
+}
+
 cache_struct! {
     #[derive(Debug, Default)]
     pub(crate) struct BookInfo {
diff --git a/lib/src/epub/core.rs b/lib/src/epub/core.rs
index c1b3ab6..c6b7f89 100644
--- a/lib/src/epub/core.rs
+++ b/lib/src/epub/core.rs
@@ -9,6 +9,7 @@ use crate::cache_struct;
 use crate::common::{escape_xml, urldecode_enhanced, IError, IResult};
 use crate::epub::common::LinkRel;
 use crate::epub::html;
+use crate::parser::HtmlParser;
 crate::cache_enum!
{
     #[derive(Clone)]
     pub enum Direction {
@@ -171,8 +172,14 @@ impl Debug for EpubHtml {
 }
 
 impl EpubHtml {
-    pub fn string_data(&self) -> String {
+    /// Returns the chapter bytes decoded as UTF-8, calling `data_mut()` first
+    /// when `_data` is still empty. Yields an empty string when no data is
+    /// available or the bytes are not valid UTF-8.
+    pub fn string_data(&mut self) -> String {
+        if self._data.is_none() {
+            self.data_mut();
+        }
         if let Some(data) = &self._data {
             String::from_utf8(data.clone()).unwrap_or_else(|_e| String::new())
         } else {
             String::new()
@@ -183,6 +190,20 @@ impl EpubHtml {
         self._data.as_deref()
     }
 
+    /// Parses the chapter's HTML into an [`HtmlParser`].
+    ///
+    /// Returns `None` when `string_data()` yields an empty string or the
+    /// markup cannot be parsed.
+    pub fn parser(&mut self) -> Option<HtmlParser> {
+        let html = self.string_data();
+        if html.is_empty() {
+            return None;
+        }
+        let mut parser = HtmlParser::new();
+        parser.parse(&html).ok()?;
+        Some(parser)
+    }
+
     pub(crate) fn read_data(&mut self, reader: &mut impl EpubReaderTrait) {
         let (id, origin) = if let Some(index) = self._file_name.find('#') {
             (
diff --git a/lib/src/epub/reader.rs b/lib/src/epub/reader.rs
index 2aae107..73c4776 100644
--- a/lib/src/epub/reader.rs
+++ b/lib/src/epub/reader.rs
@@ -1197,7 +1197,6 @@ html
         assert_ne!(0, nav.len());
         assert_ne!("", nav[0].title());
         let mut chap = book.chapters_mut();
-
         assert_eq!(75, chap.next().unwrap().data_mut().unwrap().len());
         // println!("{}", String::from_utf8( chap.next().unwrap().data().unwrap().to_vec()).unwrap());
 
@@ -1208,11 +1207,15 @@ html
             chap.next().unwrap().data_mut().unwrap().to_vec().len()
         );
 
+        for i in chap {
+            if let Some(p) = i.parser() {
+                assert_ne!(0, p.extract_plain_text().len());
+            }
+        }
         assert!(book.get_chapter("s04.xhtml#pgepubid00536").is_some());
         // assert!(chap.next().is_some());
         // chap.next();
         // chap.next();
-
         // assert_ne!("", chap.next().unwrap().title());
         // assert_ne!(None, book.chapters_mut().next().unwrap().data());
     }
diff --git a/lib/src/lib.rs b/lib/src/lib.rs
index d740b34..1ba5beb 100644
--- a/lib/src/lib.rs
+++ b/lib/src/lib.rs
@@ -3,10 +3,11 @@
 #[allow(clippy::needless_range_loop)]
 extern crate iepub_derive;
 mod adapter;
-mod common;
+pub mod common;
 mod cover;
 mod epub;
 mod mobi;
+pub mod
parser;
 pub mod path;
 
 pub use crate::common::DateTimeFormater;
diff --git a/lib/src/parser.rs b/lib/src/parser.rs
new file mode 100644
index 0000000..5664b6c
--- /dev/null
+++ b/lib/src/parser.rs
@@ -0,0 +1,342 @@
+//! Lightweight HTML/XHTML content parser built on `quick-xml`.
+
+use crate::common::{ContentItem, ContentType};
+use anyhow::{anyhow, Result};
+use quick_xml::escape::resolve_predefined_entity;
+use quick_xml::events::{BytesRef, BytesStart, Event};
+use quick_xml::reader::Reader;
+
+/// Tags that belong to the document shell rather than its content. When a
+/// document has no `<body>`, content starts at the first tag not listed here.
+const NON_CONTENT_TAGS: [&str; 6] = ["html", "head", "meta", "title", "link", "style"];
+
+/// HTML parser that turns markup into a tree of [`ContentItem`]s.
+#[derive(Debug, Default)]
+pub struct HtmlParser {
+    /// Top-level items collected by [`HtmlParser::parse`].
+    pub items: Vec<ContentItem>,
+}
+
+impl HtmlParser {
+    /// Creates a parser with no parsed items.
+    pub fn new() -> Self {
+        Self::default()
+    }
+
+    /// Parses an HTML string and appends the resulting items to `self.items`.
+    ///
+    /// Only content inside `<body>` is kept. For fragments without a `<body>`
+    /// tag, content starts at the first tag outside the document shell
+    /// (`html`, `head`, `meta`, `title`, `link`, `style`).
+    ///
+    /// # Errors
+    ///
+    /// Returns an error when the underlying XML reader rejects the markup.
+    pub fn parse(&mut self, html: &str) -> Result<()> {
+        let mut reader = Reader::from_str(html);
+        let config = reader.config_mut();
+        config.trim_text(false);
+        // Self-closing tags arrive as a Start/End pair, so `Event::Empty`
+        // never reaches the loop below and needs no arm of its own.
+        config.expand_empty_elements = true;
+        config.check_end_names = false;
+
+        let mut stack: Vec<ContentItem> = Vec::new();
+        let mut in_body = false;
+        let mut has_body_tag = false;
+
+        loop {
+            let event = reader
+                .read_event()
+                .map_err(|e| anyhow!("解析错误: {:?}", e))?;
+
+            match event {
+                Event::Eof => break,
+
+                Event::Start(e) => {
+                    let tag_name = String::from_utf8_lossy(e.name().as_ref()).into_owned();
+                    let lower_tag = tag_name.to_lowercase();
+
+                    if lower_tag == "body" {
+                        in_body = true;
+                        has_body_tag = true;
+                        continue;
+                    }
+
+                    // Without a `<body>`, the first content tag opens the body.
+                    if !has_body_tag && !NON_CONTENT_TAGS.contains(&lower_tag.as_str()) {
+                        in_body = true;
+                    }
+
+                    if in_body {
+                        stack.push(Self::build_item(&tag_name, &e));
+                    }
+                }
+
+                Event::End(e) => {
+                    let tag_name = String::from_utf8_lossy(e.name().as_ref()).to_lowercase();
+                    if tag_name == "body" {
+                        in_body = false;
+                    } else if in_body {
+                        if let Some(item) = stack.pop() {
+                            Self::attach(&mut stack, &mut self.items, item);
+                        }
+                    }
+                }
+
+                Event::Text(e) => {
+                    if in_body {
+                        // Entity references arrive as separate `GeneralRef`
+                        // events, so these bytes need no unescaping.
+                        let decoded = String::from_utf8_lossy(e.as_ref());
+                        if !decoded.trim().is_empty() {
+                            Self::append_text(&mut stack, &mut self.items, &decoded);
+                        }
+                    }
+                }
+
+                Event::GeneralRef(e) => {
+                    if in_body {
+                        if let Some(resolved) = Self::resolve_reference(&e) {
+                            Self::append_text(&mut stack, &mut self.items, &resolved);
+                        }
+                    }
+                }
+
+                Event::CData(e) => {
+                    if in_body {
+                        let text = String::from_utf8_lossy(e.as_ref());
+                        Self::append_text(&mut stack, &mut self.items, &text);
+                    }
+                }
+
+                _ => {}
+            }
+        }
+
+        // Close whatever is still open, innermost element first.
+        while let Some(item) = stack.pop() {
+            Self::attach(&mut stack, &mut self.items, item);
+        }
+
+        Ok(())
+    }
+
+    /// Builds the item for an opening tag, copying its attributes.
+    fn build_item(tag_name: &str, start: &BytesStart<'_>) -> ContentItem {
+        let mut item = ContentItem::new(Self::tag_to_content_type(tag_name));
+        // Malformed attributes are skipped; a value that fails to unescape
+        // is stored as an empty string.
+        for attr in start.attributes().flatten() {
+            let key = String::from_utf8_lossy(attr.key.as_ref()).into_owned();
+            let value = attr
+                .unescape_value()
+                .map(|value| value.into_owned())
+                .unwrap_or_default();
+            item.add_attribute(key, value);
+        }
+        item
+    }
+
+    /// Attaches a finished item to the innermost open element, or to the
+    /// top-level list when no element is open.
+    fn attach(stack: &mut [ContentItem], items: &mut Vec<ContentItem>, item: ContentItem) {
+        match stack.last_mut() {
+            Some(parent) => parent.add_child(item),
+            None => items.push(item),
+        }
+    }
+
+    /// Appends text to the innermost open element; text outside any element
+    /// becomes a standalone [`ContentType::Text`] item.
+    fn append_text(stack: &mut [ContentItem], items: &mut Vec<ContentItem>, text: &str) {
+        match stack.last_mut() {
+            Some(parent) => parent.add_text(text),
+            None => {
+                let mut text_item = ContentItem::new(ContentType::Text);
+                text_item.add_text(text);
+                items.push(text_item);
+            }
+        }
+    }
+
+    /// Resolves a `&name;` or `&#N;` reference to the text it stands for.
+    ///
+    /// Unknown named entities are kept verbatim so no content is dropped.
+    fn resolve_reference(reference: &BytesRef<'_>) -> Option<String> {
+        if let Ok(Some(ch)) = reference.resolve_char_ref() {
+            return Some(ch.to_string());
+        }
+        let name = reference.decode().ok()?;
+        let resolved = match resolve_predefined_entity(&name) {
+            Some(value) => value.to_string(),
+            None => format!("&{};", name),
+        };
+        Some(resolved)
+    }
+
+    /// Maps an HTML tag name (case-insensitive) to its content type.
+    fn tag_to_content_type(tag: &str) -> ContentType {
+        match tag.to_lowercase().as_str() {
+            "p" => ContentType::Paragraph,
+            "h1" => ContentType::Heading(1),
+            "h2" => ContentType::Heading(2),
+            "h3" => ContentType::Heading(3),
+            "h4" => ContentType::Heading(4),
+            "h5" => ContentType::Heading(5),
+            "h6" => ContentType::Heading(6),
+            "img" => ContentType::Image,
+            "a" => ContentType::Link,
+            "li" => ContentType::ListItem,
+            "blockquote" => ContentType::BlockQuote,
+            "pre" | "code" => ContentType::CodeBlock,
+            "hr" => ContentType::HorizontalRule,
+            _ => ContentType::Other(tag.to_string()),
+        }
+    }
+
+    /// Prints the parsed tree to stdout.
+    pub fn print_result(&self) {
+        println!("=== 解析结果 ===\n");
+        for (i, item) in self.items.iter().enumerate() {
+            println!("--- 内容项 {} ---", i + 1);
+            print!("{}", item.format(0));
+        }
+    }
+
+    /// Collects the trimmed text of every non-empty paragraph, in document order.
+    pub fn extract_paragraphs(&self) -> Vec<String> {
+        let mut paragraphs = Vec::new();
+        Self::extract_paragraphs_recursive(&self.items, &mut paragraphs);
+        paragraphs
+    }
+
+    fn extract_paragraphs_recursive(items: &[ContentItem], result: &mut Vec<String>) {
+        for item in items {
+            let text = item.text.trim();
+            if matches!(item.content_type, ContentType::Paragraph) && !text.is_empty() {
+                result.push(text.to_string());
+            }
+            Self::extract_paragraphs_recursive(&item.children, result);
+        }
+    }
+
+    /// Collects `(level, trimmed text)` for every non-empty heading, in document order.
+    pub fn extract_headings(&self) -> Vec<(u8, String)> {
+        let mut headings = Vec::new();
+        Self::extract_headings_recursive(&self.items, &mut headings);
+        headings
+    }
+
+    fn extract_headings_recursive(items: &[ContentItem], result: &mut Vec<(u8, String)>) {
+        for item in items {
+            if let ContentType::Heading(level) = item.content_type {
+                let text = item.text.trim();
+                if !text.is_empty() {
+                    result.push((level, text.to_string()));
+                }
+            }
+            Self::extract_headings_recursive(&item.children, result);
+        }
+    }
+
+    /// Collects the `src` attribute of every image, in document order.
+    pub fn extract_images(&self) -> Vec<String> {
+        let mut images = Vec::new();
+        Self::extract_images_recursive(&self.items, &mut images);
+        images
+    }
+
+    fn extract_images_recursive(items: &[ContentItem], result: &mut Vec<String>) {
+        for item in items {
+            if matches!(item.content_type, ContentType::Image) {
+                let src = item
+                    .attributes
+                    .iter()
+                    .find(|(key, _)| key.eq_ignore_ascii_case("src"));
+                if let Some((_, value)) = src {
+                    result.push(value.clone());
+                }
+            }
+            Self::extract_images_recursive(&item.children, result);
+        }
+    }
+
+    /// Concatenates the trimmed text of every node, each followed by a space.
+    pub fn extract_plain_text(&self) -> String {
+        let mut text = String::new();
+        Self::extract_text_recursive(&self.items, &mut text);
+        text
+    }
+
+    fn extract_text_recursive(items: &[ContentItem], result: &mut String) {
+        for item in items {
+            if !item.text.is_empty() {
+                result.push_str(item.text.trim());
+                result.push(' ');
+            }
+            Self::extract_text_recursive(&item.children, result);
+        }
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    #[test]
+    fn test_parse_html_without_body_tag() {
+        // HTML fragment without a <body> tag.
+        let html = r#"<div>171</div>
+<h3>INTRODUCTORY</h3>
+<p>The difficulties of classification are very apparent here.</p>
+<p>Another paragraph with some text.</p>"#;
+
+        let mut parser = HtmlParser::new();
+        parser.parse(html).unwrap();
+
+        println!("解析到 {} 个顶层元素", parser.items.len());
+        assert!(parser.items.len() > 0, "应该解析到至少一个元素");
+
+        let paragraphs = parser.extract_paragraphs();
+        println!("提取到 {} 个段落", paragraphs.len());
+        assert_eq!(paragraphs.len(), 2, "应该提取到 2 个段落");
+
+        let headings = parser.extract_headings();
+        println!("提取到 {} 个标题", headings.len());
+        assert_eq!(headings.len(), 1, "应该提取到 1 个标题");
+        assert_eq!(headings[0].0, 3, "标题级别应该是 3");
+        assert_eq!(headings[0].1, "INTRODUCTORY", "标题内容应该是 INTRODUCTORY");
+    }
+
+    #[test]
+    fn test_parse_html_with_body_tag() {
+        // HTML document with a <body> tag.
+        let html = r#"<html>
+<body>
+<h1>章节标题</h1>
+<p>这是第一段内容。</p>
+</body>
+</html>"#;
+
+        let mut parser = HtmlParser::new();
+        parser.parse(html).unwrap();
+
+        assert!(parser.items.len() > 0);
+
+        let paragraphs = parser.extract_paragraphs();
+        assert_eq!(paragraphs.len(), 1);
+
+        let headings = parser.extract_headings();
+        assert_eq!(headings.len(), 1);
+        assert_eq!(headings[0].0, 1);
+    }
+
+    #[test]
+    fn test_entity_references_are_kept() {
+        let mut parser = HtmlParser::new();
+        parser.parse("<p>Tom &amp; Jerry &#169;</p>").unwrap();
+
+        assert_eq!(parser.extract_paragraphs(), vec!["Tom & Jerry ©"]);
+    }
+}