From 45b4e45d603901b098a988c089f5cc28f6a1c479 Mon Sep 17 00:00:00 2001 From: MoonLL Date: Thu, 22 Jan 2026 15:20:12 +0800 Subject: [PATCH] =?UTF-8?q?[222=5F33]=20=E5=A2=9E=E5=BC=BAhtml=E7=9A=84?= =?UTF-8?q?=E6=A0=BC=E5=BC=8F=E6=A3=80=E6=B5=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- TeXmacs/plugins/html/progs/data/html.scm | 241 +++++++++++++ TeXmacs/tests/222_34.scm | 430 +++++++++++++++++++++++ devel/222_34.md | 15 + 3 files changed, 686 insertions(+) create mode 100644 TeXmacs/tests/222_34.scm create mode 100644 devel/222_34.md diff --git a/TeXmacs/plugins/html/progs/data/html.scm b/TeXmacs/plugins/html/progs/data/html.scm index 307c436241..48fdcf45d8 100644 --- a/TeXmacs/plugins/html/progs/data/html.scm +++ b/TeXmacs/plugins/html/progs/data/html.scm @@ -17,6 +17,220 @@ ;; Html ;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; +;; 按行分割文本 +(define (html-string-split-lines s) + (let ((len (if (>= (string-length s) 1000) 1000 (string-length s)))) + (let loop ((i 0) + (start 0) + (result '())) + (cond ((>= i len) + (reverse (cons (substring s start i) result))) + ((char=? (string-ref s i) #\newline) + (loop (+ i 1) + (+ i 1) + (cons (substring s start i) result))) + (else (loop (+ i 1) start result)))))) + +;; 某个字符在文本中的含量 +(define (charactor-from-string s ch) + (if (not (string-null? s)) + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len))) + (let loop ((ref 0) + (count 0)) + (if (>= ref limit) + (/ count len) + (loop (+ ref 1) + (if (char=? (string-ref s ref) ch) + (+ count 1) + count))))) + #f)) + +;; 计算一个子串在文本中的含量,计算的是子串的字符数,而不是个数 +(define (html-string-count-substring s sub) + (let ((sub-len (string-length sub))) + (if (zero? sub-len) + 0 + (let loop ((i 0) + (count 0)) + (if (>= i (- (string-length s) sub-len -1)) + count + (if (string=? (substring s i (+ i sub-len)) sub) + (loop (+ i sub-len) (+ count 1)) + (loop (+ i 1) count))))))) + +;; < 和 > 的含量 +(define (html-angle-bracket-density s) + (if (string-null? s) + 0 + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len)) + (substr (substring s 0 limit))) + (/ (+ (charactor-from-string substr #\<) + (charactor-from-string substr #\>)) + len)))) + +;; 完整的tag子串在文本中的字符含量 +(define (html-tag-density s) + (if (string-null? s) + 0 + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len)) + (substr (substring s 0 limit)) + (lc-substr (string-downcase substr))) + (let ((count (+ (html-string-count-substring lc-substr "= len 1000) 1000 len)) + (substr (substring s 0 limit))) + (/ (+ (charactor-from-string substr #\=) + (charactor-from-string substr #\")) + len)))) + +;; 这一行文本是否包含html标签 +(define (html-line-contains-features? line) + (let ((lc-line (string-downcase line))) + (or + (> (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line "/>") 0) + (> (html-string-count-substring lc-line " (html-string-count-substring lc-line " total 0) (/ count total) 0) + (let ((line (car remaining))) + (loop (cdr remaining) + (if (html-line-contains-features? line) (+ count 1) count) + (+ total 1)))))))) + +;; 计算div标签的平衡性 +(define (html-structure-balanced? s) + (let* ((lc-s (string-downcase s)) + (open-tags (html-string-count-substring lc-s " open-tags 0) (> close-tags 0) (<= (abs (- open-tags close-tags)) 2)))) + +;; 短字符串的特殊检测 +(define (determine-short-html-string s) + (let* ((len (string-length s))) + (cond + ((or + (and (> (charactor-from-string s #\<) 0) + (> (charactor-from-string s #\>) 0) + (> (html-string-count-substring s " (html-string-count-substring (string-downcase s) "class=") 0) + (> (html-string-count-substring (string-downcase s) "id=") 0) + (> (html-string-count-substring (string-downcase s) "style=") 0) + (> (html-string-count-substring (string-downcase s) "href=") 0) + (> (html-string-count-substring (string-downcase s) "src=") 0)) + #t) + ((>= (html-angle-bracket-density s) 0.03) #t) + (else #f)))) + +(define (is-short-html-string? s) + (if (<= (string-length s) 100) + (determine-short-html-string s) + #f)) + + (define (is-html-string? s) + (let* ((angle-density (html-angle-bracket-density s)) + (tag-density (html-tag-density s)) + (attr-density (html-attribute-density s)) + (feature-line-density (html-feature-line-density s)) + (balanced? (html-structure-balanced? s))) + (cond + ;; High confidence: clear HTML structure + ;; < > 含量,标签含量,特征行含量 + ((and (>= angle-density 0.02) + (>= tag-density 0.01) + (>= feature-line-density 0.25)) + #t) + ;; Medium confidence: good angle bracket density with either tags or attributes + ;; + ((and (>= angle-density 0.015) + (or (>= tag-density 0.005) + (>= attr-density 0.01)) + (>= feature-line-density 0.15)) + #t) + ;; Lower confidence: balanced structure with some HTML features + ((and balanced? + (>= angle-density 0.01) + (>= feature-line-density 0.10)) + #t) + ;; Very high angle bracket density (likely HTML/XML) + ((>= angle-density 0.03) #t) + (else #f)))) + (define (html-recognizes-at? s pos) (set! pos (format-skip-spaces s pos)) (cond ((format-test? s pos "") #t) + ((format-test? s pos ". +;; +;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;;; + +(import (liii check)) + +(check-set-mode! 'report-failed) + +;; ============================================================================ +;; The dependent function +;; ============================================================================ + +;; 按行分割文本 +(define (html-string-split-lines s) + (let ((len (if (>= (string-length s) 1000) 1000 (string-length s)))) + (let loop ((i 0) + (start 0) + (result '())) + (cond ((>= i len) + (reverse (cons (substring s start i) result))) + ((char=? (string-ref s i) #\newline) + (loop (+ i 1) + (+ i 1) + (cons (substring s start i) result))) + (else (loop (+ i 1) start result)))))) + +;; 某个字符在文本中的含量 +(define (charactor-from-string s ch) + (if (not (string-null? s)) + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len))) + (let loop ((ref 0) + (count 0)) + (if (>= ref limit) + (/ count len) + (loop (+ ref 1) + (if (char=? (string-ref s ref) ch) + (+ count 1) + count))))) + #f)) + +;; 计算一个子串在文本中的含量,计算的是子串的字符数,而不是个数 +(define (html-string-count-substring s sub) + (let ((sub-len (string-length sub))) + (if (zero? sub-len) + 0 + (let loop ((i 0) + (count 0)) + (if (>= i (- (string-length s) sub-len -1)) + count + (if (string=? (substring s i (+ i sub-len)) sub) + (loop (+ i sub-len) (+ count 1)) + (loop (+ i 1) count))))))) + +;; < 和 > 的含量 +(define (html-angle-bracket-density s) + (if (string-null? s) + 0 + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len)) + (substr (substring s 0 limit))) + (/ (+ (charactor-from-string substr #\<) + (charactor-from-string substr #\>)) + len)))) + +;; 完整的tag子串在文本中的字符含量 +(define (html-tag-density s) + (if (string-null? s) + 0 + (let* ((len (string-length s)) + (limit (if (>= len 1000) 1000 len)) + (substr (substring s 0 limit)) + (lc-substr (string-downcase substr))) + (let ((count (+ (html-string-count-substring lc-substr "= len 1000) 1000 len)) + (substr (substring s 0 limit))) + (/ (+ (charactor-from-string substr #\=) + (charactor-from-string substr #\")) + len)))) + +;; 这一行文本是否包含html标签 +(define (html-line-contains-features? line) + (let ((lc-line (string-downcase line))) + (or + (> (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line " (html-string-count-substring lc-line "/>") 0) + (> (html-string-count-substring lc-line " (html-string-count-substring lc-line " total 0) (/ count total) 0) + (let ((line (car remaining))) + (loop (cdr remaining) + (if (html-line-contains-features? line) (+ count 1) count) + (+ total 1)))))))) + +;; 计算div标签的平衡性 +(define (html-structure-balanced? s) + (let* ((lc-s (string-downcase s)) + (open-tags (html-string-count-substring lc-s " open-tags 0) (> close-tags 0) (<= (abs (- open-tags close-tags)) 2)))) + +;; 短字符串的特殊检测 +(define (determine-short-html-string s) + (let* ((len (string-length s))) + (cond + ((or + (and (> (charactor-from-string s #\<) 0) + (> (charactor-from-string s #\>) 0) + (> (html-string-count-substring s " (html-string-count-substring (string-downcase s) "class=") 0) + (> (html-string-count-substring (string-downcase s) "id=") 0) + (> (html-string-count-substring (string-downcase s) "style=") 0) + (> (html-string-count-substring (string-downcase s) "href=") 0) + (> (html-string-count-substring (string-downcase s) "src=") 0)) + #t) + ((>= (html-angle-bracket-density s) 0.03) #t) + (else #f)))) + +(define (is-short-html-string? s) + (if (<= (string-length s) 100) + (determine-short-html-string s) + #f)) + + (define (is-html-string? s) + (let* ((angle-density (html-angle-bracket-density s)) + (tag-density (html-tag-density s)) + (attr-density (html-attribute-density s)) + (feature-line-density (html-feature-line-density s)) + (balanced? (html-structure-balanced? s))) + (cond + ;; High confidence: clear HTML structure + ;; < > 含量,标签含量,特征行含量 + ((and (>= angle-density 0.02) + (>= tag-density 0.01) + (>= feature-line-density 0.25)) + #t) + ;; Medium confidence: good angle bracket density with either tags or attributes + ;; + ((and (>= angle-density 0.015) + (or (>= tag-density 0.005) + (>= attr-density 0.01)) + (>= feature-line-density 0.15)) + #t) + ;; Lower confidence: balanced structure with some HTML features + ((and balanced? + (>= angle-density 0.01) + (>= feature-line-density 0.10)) + #t) + ;; Very high angle bracket density (likely HTML/XML) + ((>= angle-density 0.03) #t) + (else #f)))) + +(define (html-recognizes-at? s pos) + (set! pos (format-skip-spaces s pos)) + (cond ((format-test? s pos "") #t) + ((format-test? s pos "\n\n\n Test Page\n\n\n

Hello World

\n

This is a test +paragraph.

\n\n") + +(define html-text2 "\n\n\n\n XHTML Document\n\n\n

This is +XHTML.

\n\n") + +;; HTML fragments +(define html-text3 "
\n

Section Title

\n

Some content here.

\n
    \n
  • Item 1
  • \n
  • Item 2
  • \n
  • Item 3
  • \n +
\n
") + +(define html-text4 "\n \n \n \n \n \n \n \n \n
Header 1Header 2
Cell 1Cell 2
") + +(define html-text5 "
\n \n \n
\n \n
") + +;; HTML with inline styles and scripts +(define html-text6 "\n") + +;; Short HTML snippets +(define html-text7 "

This is a paragraph.

") + +(define html-text8 "Click here") + +(define html-text9 "\"Sample") + +(define html-text10 "Red text") + +;; HTML with MathML +(define html-text11 "\n \n x\n =\n \n \n -\n +b\n ±\n \n \n \n b\n 2\n \n +-\n 4\n a\n c\n \n \n \n \n 2\n + a\n \n \n \n") + +;; HTML with mixed content +(define html-text12 "
\n

Mixed Content

\n

This paragraph contains bold text and italic text.

\n

Here's a link and an \"icon\" image.

\n
") + +;; HTML with comments +(define html-text13 "\n
\n \n

Visible content

\n \n
") + +;; HTML with data attributes +(define html-text14 "
\n Custom widget\n
") + +;; HTML with aria attributes +(define html-text15 "") + +;; Should NOT be detected as HTML + +;; Plain text +(define non-html-text1 "This is plain text without any HTML tags.") + +(define non-html-text2 "Hello, world! This is a simple sentence.") + +;; Markdown text +(define non-html-text3 "# Markdown Title\n\nThis is a paragraph in Markdown.\n\n- List item 1\n- List item 2\n- List item 3") + +(define non-html-text4 "**Bold text** and *italic text* with `inline code`.") + +;; LaTeX text +(define non-html-text5 "\\documentclass{article}\n\\begin{document}\n\\section{Introduction}\nThis is a LaTeX document.\n\\end{document}") + +;; JSON text +(define non-html-text6 "{\n \"name\": \"John Doe\",\n \"age\": 30,\n \"city\": \"New York\"\n}") + +;; XML (non-HTML) +(define non-html-text7 "\n\n \n localhost\n 8080\n \n") + +;; Code (Python) +(define non-html-text8 "def hello_world():\n print(\"Hello, World!\")\n return True") + +;; Code (JavaScript) +(define non-html-text9 "function calculateSum(a, b) {\n return a + b;\n}\n\nconsole.log(calculateSum(5, 3));") + +;; CSV data +(define non-html-text10 "Name,Age,City\nJohn,30,New York\nJane,25,London\nBob,35,Tokyo") + +;; Email addresses and URLs (without tags) +(define non-html-text11 "Contact us at info@example.com or visit https://example.com") + +;; File paths +(define non-html-text12 "C:\\Users\\Name\\Documents\\file.txt\n/home/user/projects/src/main.py") + +;; Edge cases + +;; Text with angle brackets but not HTML +(define non-html-text13 "x < y and y > z") ; Mathematical inequalities + +(define non-html-text14 "5 < 10 > 3") ; More inequalities + +;; Text with quotes and equals but not HTML +(define non-html-text15 "name=\"John\" age=30 city=\"NYC\"") ; Looks like attributes but no tags + +;; Text with very low HTML feature density +(define non-html-text16 "This is a long text document with many paragraphs. It contains some special characters like < and > and = and \" but they are not used in HTML +context. The document continues for many lines to ensure it's long enough for statistical analysis.") + + + +;; ============================================================================ +;; Test function +;; ============================================================================ + +(define (test-html-format-determine) + + ;; Should be detected as HTML + (display "Testing HTML detection (should return #t):\n") + (check (html-recognizes-at? html-text1 0) => #t) + (check (html-recognizes-at? html-text2 0) => #t) + (check (html-recognizes-at? html-text3 0) => #t) + (check (html-recognizes-at? html-text4 0) => #t) + (check (html-recognizes-at? html-text5 0) => #t) + (check (html-recognizes-at? html-text6 0) => #t) + (check (html-recognizes-at? html-text7 0) => #t) + (check (html-recognizes-at? html-text8 0) => #t) + (check (html-recognizes-at? html-text9 0) => #t) + (check (html-recognizes-at? html-text10 0) => #t) + (check (html-recognizes-at? html-text11 0) => #t) + (check (html-recognizes-at? html-text12 0) => #t) + (check (html-recognizes-at? html-text13 0) => #t) + (check (html-recognizes-at? html-text14 0) => #t) + (check (html-recognizes-at? html-text15 0) => #t) + + ;; Should NOT be detected as HTML + (display "\nTesting non-HTML detection (should return #f):\n") + (check (html-recognizes-at? non-html-text1 0) => #f) + (check (html-recognizes-at? non-html-text2 0) => #f) + (check (html-recognizes-at? non-html-text3 0) => #f) + (check (html-recognizes-at? non-html-text4 0) => #f) + (check (html-recognizes-at? non-html-text5 0) => #f) + (check (html-recognizes-at? non-html-text6 0) => #f) + (check (html-recognizes-at? non-html-text7 0) => #f) + (check (html-recognizes-at? non-html-text8 0) => #f) + (check (html-recognizes-at? non-html-text9 0) => #f) + (check (html-recognizes-at? non-html-text10 0) => #f) + (check (html-recognizes-at? non-html-text11 0) => #f) + (check (html-recognizes-at? non-html-text12 0) => #f) + (check (html-recognizes-at? non-html-text13 0) => #f) + (check (html-recognizes-at? non-html-text14 0) => #f) + (check (html-recognizes-at? non-html-text15 0) => #f) + (check (html-recognizes-at? non-html-text16 0) => #f)) + +(tm-define (test_222_33) + (test-html-format-determine) + (check-report)) + diff --git a/devel/222_34.md b/devel/222_34.md new file mode 100644 index 0000000000..e488b0a6bf --- /dev/null +++ b/devel/222_34.md @@ -0,0 +1,15 @@ +# [222_34] 增强html的格式检测 + +## 如何测试 + +```shell +bin/test_only 222_34 +``` + +## 2026/01/22 增强html的格式检测 +1. < > 含量检测 +2. = " 含量检测 +3. 常见html标签检测 +4. 有html特征的行的检测 +5. div标签的平衡性检测 +6. 短文本检测