open-tags 0) (> close-tags 0) (<= (abs (- open-tags close-tags)) 2))))
+
+;; Special-case detection for short strings.
+;; Returns #t when S looks like HTML and #f otherwise. S is accepted when
+;;   (a) it has a `<`, a `>` and the substring probe below all reporting > 0, or
+;;   (b) its lower-cased form contains one of the attribute markers
+;;       class= / id= / style= / href= / src=, or
+;;   (c) its angle-bracket density is at least 0.03.
+(define (determine-short-html-string s)
+  ;; Lower-case once and reuse it for every attribute probe.
+  (let* ((lowered (string-downcase s))
+         (mentions? (lambda (needle)
+                      (> (html-string-count-substring lowered needle) 0))))
+    (cond
+      ((or
+         (and (> (charactor-from-string s #\<) 0)
+              (> (charactor-from-string s #\>) 0)
+              ;; NOTE(review): the needle here is the empty string; this looks
+              ;; like it may be missing text (a closing-tag marker?) - confirm.
+              (> (html-string-count-substring s "") 0))
+         (mentions? "class=")
+         (mentions? "id=")
+         (mentions? "style=")
+         (mentions? "href=")
+         (mentions? "src="))
+       #t)
+      ;; Fallback: dense angle brackets alone are taken as HTML.
+      ((>= (html-angle-bracket-density s) 0.03) #t)
+      (else #f))))
+
+;; Short-string entry point: only strings of at most 100 characters are
+;; handed to determine-short-html-string; anything longer yields #f.
+(define (is-short-html-string? s)
+  (and (<= (string-length s) 100)
+       (determine-short-html-string s)))
+
+ (define (is-html-string? s)
+   ;; Decide whether S is HTML from five measurements taken up front:
+   ;; angle-bracket density, tag density, attribute density, feature-line
+   ;; density and the structure-balance flag. The result is #t as soon as one
+   ;; of the four rules below holds, and #f when none does.
+   (let* ((bracket-ratio (html-angle-bracket-density s))
+          (tag-ratio (html-tag-density s))
+          (attr-ratio (html-attribute-density s))
+          (feature-line-ratio (html-feature-line-density s))
+          (well-balanced (html-structure-balanced? s)))
+     (or
+       ;; High confidence: clear HTML structure
+       ;; (share of < >, share of tags, share of feature lines).
+       (and (>= bracket-ratio 0.02)
+            (>= tag-ratio 0.01)
+            (>= feature-line-ratio 0.25))
+       ;; Medium confidence: decent bracket density backed by tags or attributes.
+       (and (>= bracket-ratio 0.015)
+            (or (>= tag-ratio 0.005)
+                (>= attr-ratio 0.01))
+            (>= feature-line-ratio 0.15))
+       ;; Lower confidence: balanced structure plus some HTML features.
+       (and well-balanced
+            (>= bracket-ratio 0.01)
+            (>= feature-line-ratio 0.10))
+       ;; Very high bracket density on its own (likely HTML/XML).
+       (>= bracket-ratio 0.03))))
+
+(define (html-recognizes-at? s pos)
+ (set! pos (format-skip-spaces s pos))
+ (cond ((format-test? s pos "") #t)
+ ((format-test? s pos "
\n\n\n
Test Page\n\n\n
Hello World
\n
This is a test
+paragraph.
\n\n")
+
+(define html-text2 "\n\n\n\n
XHTML Document\n\n\n
This is
+XHTML.
\n\n")
+
+;; HTML fragments
+(define html-text3 "
\n
Section Title
\n
Some content here.
\n
\n - Item 1
\n - Item 2
\n - Item 3
\n
+
\n
")
+
+(define html-text4 "
\n \n | Header 1 | \n Header 2 | \n
\n \n | Cell 1 | \n Cell 2 | \n
\n
")
+
+(define html-text5 "
")
+
+;; HTML with inline styles and scripts
+(define html-text6 "\n")
+
+;; Short HTML snippets
+(define html-text7 "
This is a paragraph.
")
+
+(define html-text8 "
Click here")
+
+(define html-text9 "

")
+
+(define html-text10 "
Red text")
+
+;; HTML with MathML
+(define html-text11 "
")
+
+;; HTML with mixed content
+(define html-text12 "
\n
Mixed Content
\n
This paragraph contains bold text and italic text.
\n
Here's a link and an
image.
\n
")
+
+;; HTML with comments
+(define html-text13 "\n
\n \n
Visible content
\n \n
")
+
+;; HTML with data attributes
+(define html-text14 "
\n Custom widget\n
")
+
+;; HTML with aria attributes
+(define html-text15 "
")
+
+;; Should NOT be detected as HTML
+;; Negative fixtures: test-html-format-determine expects html-recognizes-at?
+;; to return #f for each of these when probed at position 0.
+
+;; Plain text
+(define non-html-text1 "This is plain text without any HTML tags.")
+
+(define non-html-text2 "Hello, world! This is a simple sentence.")
+
+;; Markdown text
+(define non-html-text3 "# Markdown Title\n\nThis is a paragraph in Markdown.\n\n- List item 1\n- List item 2\n- List item 3")
+
+(define non-html-text4 "**Bold text** and *italic text* with `inline code`.")
+
+;; LaTeX text
+(define non-html-text5 "\\documentclass{article}\n\\begin{document}\n\\section{Introduction}\nThis is a LaTeX document.\n\\end{document}")
+
+;; JSON text
+(define non-html-text6 "{\n \"name\": \"John Doe\",\n \"age\": 30,\n \"city\": \"New York\"\n}")
+
+;; XML (non-HTML)
+(define non-html-text7 "\n
\n \n localhost\n 8080\n \n")
+
+;; Code (Python)
+(define non-html-text8 "def hello_world():\n print(\"Hello, World!\")\n return True")
+
+;; Code (JavaScript)
+(define non-html-text9 "function calculateSum(a, b) {\n return a + b;\n}\n\nconsole.log(calculateSum(5, 3));")
+
+;; CSV data
+(define non-html-text10 "Name,Age,City\nJohn,30,New York\nJane,25,London\nBob,35,Tokyo")
+
+;; Email addresses and URLs (without tags)
+(define non-html-text11 "Contact us at info@example.com or visit https://example.com")
+
+;; File paths
+(define non-html-text12 "C:\\Users\\Name\\Documents\\file.txt\n/home/user/projects/src/main.py")
+
+;; Edge cases
+
+;; Text with angle brackets but not HTML
+(define non-html-text13 "x < y and y > z") ; Mathematical inequalities
+
+(define non-html-text14 "5 < 10 > 3") ; More inequalities
+
+;; Text with quotes and equals but not HTML
+(define non-html-text15 "name=\"John\" age=30 city=\"NYC\"") ; Looks like attributes but no tags
+
+;; Text with very low HTML feature density
+(define non-html-text16 "This is a long text document with many paragraphs. It contains some special characters like < and > and = and \" but they are not used in HTML
+context. The document continues for many lines to ensure it's long enough for statistical analysis.")
+
+
+
+;; ============================================================================
+;; Test function
+;; ============================================================================
+
+;; Runs every fixture through html-recognizes-at? starting at position 0.
+;; html-text1 .. html-text15 must yield #t; non-html-text1 .. non-html-text16
+;; must yield #f. A heading is printed with `display` before each group.
+;; The checks are kept as individual `check` forms, one per fixture.
+(define (test-html-format-determine)
+
+ ;; Should be detected as HTML
+ (display "Testing HTML detection (should return #t):\n")
+ (check (html-recognizes-at? html-text1 0) => #t)
+ (check (html-recognizes-at? html-text2 0) => #t)
+ (check (html-recognizes-at? html-text3 0) => #t)
+ (check (html-recognizes-at? html-text4 0) => #t)
+ (check (html-recognizes-at? html-text5 0) => #t)
+ (check (html-recognizes-at? html-text6 0) => #t)
+ (check (html-recognizes-at? html-text7 0) => #t)
+ (check (html-recognizes-at? html-text8 0) => #t)
+ (check (html-recognizes-at? html-text9 0) => #t)
+ (check (html-recognizes-at? html-text10 0) => #t)
+ (check (html-recognizes-at? html-text11 0) => #t)
+ (check (html-recognizes-at? html-text12 0) => #t)
+ (check (html-recognizes-at? html-text13 0) => #t)
+ (check (html-recognizes-at? html-text14 0) => #t)
+ (check (html-recognizes-at? html-text15 0) => #t)
+
+ ;; Should NOT be detected as HTML
+ (display "\nTesting non-HTML detection (should return #f):\n")
+ (check (html-recognizes-at? non-html-text1 0) => #f)
+ (check (html-recognizes-at? non-html-text2 0) => #f)
+ (check (html-recognizes-at? non-html-text3 0) => #f)
+ (check (html-recognizes-at? non-html-text4 0) => #f)
+ (check (html-recognizes-at? non-html-text5 0) => #f)
+ (check (html-recognizes-at? non-html-text6 0) => #f)
+ (check (html-recognizes-at? non-html-text7 0) => #f)
+ (check (html-recognizes-at? non-html-text8 0) => #f)
+ (check (html-recognizes-at? non-html-text9 0) => #f)
+ (check (html-recognizes-at? non-html-text10 0) => #f)
+ (check (html-recognizes-at? non-html-text11 0) => #f)
+ (check (html-recognizes-at? non-html-text12 0) => #f)
+ (check (html-recognizes-at? non-html-text13 0) => #f)
+ (check (html-recognizes-at? non-html-text14 0) => #f)
+ (check (html-recognizes-at? non-html-text15 0) => #f)
+ (check (html-recognizes-at? non-html-text16 0) => #f))
+
+;; Test entry point: runs test-html-format-determine, then calls check-report.
+;; NOTE(review): this is named test_222_33, while devel/222_34.md documents
+;; `bin/test_only 222_34` - confirm which number is intended.
+(tm-define (test_222_33)
+ (test-html-format-determine)
+ (check-report))
+
diff --git a/devel/222_34.md b/devel/222_34.md
new file mode 100644
index 0000000000..e488b0a6bf
--- /dev/null
+++ b/devel/222_34.md
@@ -0,0 +1,15 @@
+# [222_34] 增强 HTML 的格式检测
+
+## 如何测试
+
+```shell
+bin/test_only 222_34
+```
+
+## 2026/01/22 增强 HTML 的格式检测
+1. `<` `>` 含量检测
+2. `=` `"` 含量检测
+3. 常见 HTML 标签检测
+4. 有 HTML 特征的行的检测
+5. div 标签的平衡性检测
+6. 短文本检测