font-specific encodings should be detected as windows-1252 by sniffer

wakaba · wakaba · commit 574e9ba09973 · 2025-11-08T20:22:57.000+09:00
diff --git a/lib/Web/Encoding/Sniffer.pm b/lib/Web/Encoding/Sniffer.pm
@@ -5,13 +5,14 @@ our $VERSION = '1.0';
 use Web::Encoding;
 
 ## context
+##   any          - any content (not implemented by browsers)
 ##   html         - HTML (navigate)
 ##   responsehtml - HTML (responseXML)
 ##   xml          - XML (navigate, responseXML, responseText)
-##   css          - CSS
+##   css          - CSS external style sheet
 ##   text         - text (navigate)
-##   responsetext - non-XML (responseText)
-##   classicscript - <script src> with type "classic"
+##   responsetext - non-XML text (responseText)
+##   classicscript - JavaScript (<script src> with type "classic")
 sub new_from_context ($$) {
   return bless {
     context => $_[1],
@@ -26,6 +27,10 @@ sub encoding ($) {
   return $_[0]->{encoding};
 } # encoding
 
+sub font_encoding ($) {
+  return $_[0]->{font_encoding}; # charset of the font matched by the last |detect|; undef if none
+} # font_encoding
+
 sub source ($) {
   return $_[0]->{source};
 } # source
@@ -211,6 +216,7 @@ sub _prescan_byte_stream ($) {
 ## locale    - user's locale's language tag in lowercase or undef
 sub detect ($$;%) {
   my ($self, undef, %args) = @_;
+  delete $self->{font_encoding};
 
   ## BOM
   if ($_[1] =~ /^\xFE\xFF/) {
@@ -255,7 +261,8 @@ sub detect ($$;%) {
     ## Prescan xml
     if ($self->{context} eq 'html' or
         $self->{context} eq 'responsehtml' or
-        $self->{context} eq 'xml') {
+        $self->{context} eq 'xml' or
+        $self->{context} eq 'any') {
       my $name = _prescan_xml $_[1];
       if (defined $name) {
         $self->{encoding} = $name;
@@ -271,7 +278,8 @@ sub detect ($$;%) {
 
     ## Prescan html
     if ($self->{context} eq 'html' or
-        $self->{context} eq 'responsehtml') {
+        $self->{context} eq 'responsehtml' or
+        $self->{context} eq 'any') {
       my $name = _prescan_byte_stream $_[1];
       if (defined $name) {
         $self->{encoding} = $name;
@@ -285,7 +293,8 @@ sub detect ($$;%) {
       }
     }
 
-    if ($self->{context} eq 'css') {
+    if ($self->{context} eq 'css' or
+        $self->{context} eq 'any') {
       ## <https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding>
       if ($_[1] =~ /\A\x40\x63\x68\x61\x72\x73\x65\x74\x20\x22([\x00-\x21\x23-\x7F]*)\x22\x3B/) {
         my $name = encoding_label_to_name $1;
@@ -318,17 +327,37 @@ sub detect ($$;%) {
       return;
     }
 
-    if ($self->{context} eq 'html' or $self->{context} eq 'text') {
-      ## UNIVCHARDET
-      require Web::Encoding::UnivCharDet;
-      my $det = Web::Encoding::UnivCharDet->new;
-      # XXX locale-dependent configuration
-      my $name = encoding_label_to_name $det->detect_byte_string ($_[1]);
-      if ($name) {
-        $self->{encoding} = $name;
-        delete $self->{confident};
-        $self->{source} = 'univchardet';
-        return;
+    if ($self->{context} eq 'html' or
+        $self->{context} eq 'text' or
+        $self->{context} eq 'any') {
+      ## Implementation-dependent detections
+      {
+        my $font_def;
+        if ($self->{context} eq 'html' or $self->{context} eq 'any') {
+          $font_def = $self->_detect_font ($_[1]); # or undef
+        }
+      
+        ## UNIVCHARDET
+        require Web::Encoding::UnivCharDet;
+        my $det = Web::Encoding::UnivCharDet->new;
+        # XXX locale-dependent configuration
+        my $got = $det->detect_byte_string ($_[1]);
+        my $name = encoding_label_to_name $got;
+        if (defined $font_def) {
+          if (not defined $name or not $name eq 'utf-8') {
+            $self->{encoding} = 'windows-1252';
+            delete $self->{confident};
+            $self->{source} = 'font';
+            $self->{font_encoding} = $font_def->{charset};
+            return;
+          }
+        }
+        if (defined $name and not $got eq 'ascii') {
+          $self->{encoding} = $name;
+          delete $self->{confident};
+          $self->{source} = 'univchardet';
+          return;
+        }
       }
 
       ## Locale
@@ -363,11 +392,176 @@ sub detect ($$;%) {
   return;
 } # detect
 
+# XXX Provisional table: lowercased font family name => definition whose |charset| names the font-specific encoding.
+my $FontDefs = {
+  "limon s1" => {charset => 'x-abc'},
+  aniezhai => {charset => "x-aniezhai"},
+  "adarshalipiexp" => {charset => "x-adarshalipiexp"},
+  "adhawin-tamil" => {charset => "x-adhawin"},
+  "adhawin-tamil regular" => {charset => "x-adhawin"},
+  adhawintamil => {charset => "x-adhawin"},
+  amudham2000 => {charset => "x-amudham2000"},
+  "arial am" => {charset => "armscii-8"},
+  "arial latarm" => {charset => "armscii-8"},
+  au => {charset => "x-au"},
+  bhaskar => {charset => "bhaskar"},
+  chanakya => {charset => "x-chanakya"},
+  eenadu => {charset => "x-eenadu"},
+  epatrika => {charset => "x-epatrika"},
+  rswwwnet => {charset => "georgian-academy"},
+  trg1 => {charset => "georgian-academy"},
+  "bpg classic dina" => {charset => "georgian-academy"},
+  gopika => {charset => "x-gopika"},
+  htchanakya => {charset => "htchanakya"},
+  inaimathi => {charset => "x-inaimathi"},
+  "inaimathi-1.8" => {charset => "x-inaimathi"},
+  jagran => {charset => "jagran"},
+  "ml-ttkarthika" => {charset => "x-karthika"},
+  lokweb => {charset => "x-lokweb"},
+  "lt-tm-barani" => {charset => "x-tam-lttmbarani"},
+  "mac c swiss" => {charset => "x-mac-c-swiss"},
+  "knw-ttnandi" => {charset => "x-nandi"},
+  "utopic" => {charset => "x-utopic"},
+  "unq_ttabid" => {charset => "x-pascii"},
+  pothana => {charset => "x-pothana"},
+  "sanskrit new" => {charset => "x-sanskrit-new"},
+  "or-ttsarala" => {charset => "x-sarala"},
+  "shree-mal-0502" => {charset => "x-shree-mal-0502"},
+  "shree-tel-0900" => {charset => "x-shree-tel-0900"},
+  shree802 => {charset => "x-shree802"},
+  "subak-1" => {charset => "x-subak"},
+  "dv-ttsurekh" => {charset => "x-surekh"},
+  suritlr => {charset => "x-suritlr"},
+  suritlk => {charset => "x-suritlk"},
+  webtamil => {charset => "x-tam-webtamil"},
+  "telugu lipi" => {charset => "x-telugu-lipi"},
+  thoolika => {charset => "x-thoolika"},
+  tikkana => {charset => "x-tikkana"},
+  tboomis => {charset => "x-tam-tboomis"},
+  tboomih => {charset => "x-tam-tboomis"},
+  tboomi => {charset => "x-tam-tboomis"},
+  tmnews => {charset => "x-tam-tmnews"},
+  telugufont => {charset => "x-telugufont"},
+  "tab-anna" => {charset => "tab"},
+  "tab_inaimathi" => {charset => "tab"},
+  "tab-lfs-kamban" => {charset => "tab"},
+  'tam-kalaignar' => {charset => "tam"},
+  "tsc_janani" => {charset => "tscii"},
+  "thunaivantsc" => {charset => "tscii"},
+  "tscsaiindira" => {charset => "tscii"},
+  "tscsaisai" => {charset => "tscii"},
+  "tscarial" => {charset => "tscii"},
+  "tsccomic" => {charset => "tscii"},
+  "tscmylai" => {charset => "tscii"},
+  "tsctimes" => {charset => "tscii"},
+  "tscverdana" => {charset => "tscii"},
+  "tsc_avarangal" => {charset => "tscii"},
+  "tsc_avarangalfxd" => {charset => "tscii"},
+  "tsc_kannadaasan" => {charset => "tscii"},
+  "tsc_paranar" => {charset => "tscii"},
+  "tsc_thunaivan" => {charset => "tscii"},
+  "tsc-sri" => {charset => "tscii"},
+  tscu_inaimathi => {charset => "tscii"},
+  inaimathitsc => {charset => "tscii"},
+  tneritsc => {charset => "tscii"},
+  "perathanaitsc" => {charset => "tscii"},
+  "aparanartsc" => {charset => "tscii"},
+  "comictsc" => {charset => "tscii"},
+  "maduramtsc" => {charset => "tscii"},
+  "mylaifixtsc" => {charset => "tscii"},
+  "mylaitsc" => {charset => "tscii"},
+  "nanthinitsc" => {charset => "tscii"},
+  "sri-tsc" => {charset => "tscii"},
+  "timestsc" => {charset => "tscii"},
+  "tneritsc" => {charset => "tscii"}, # NOTE(review): same key as the earlier |tneritsc| entry (identical value)
+  "tamil_avarangal31tsc" => {charset => "tscii"},
+  shivaji01 => {charset => "x-shivaji01"},
+  vakil_01 => {charset => "x-vakil_01"},
+  ".vntime" => {charset => "x-viet-tcvn"},
+  "vntime" => {charset => "x-viet-tcvn"},
+  "vni-aptima" => {charset => "x-viet-vni"},
+  "vni-helve" => {charset => "x-viet-vni"},
+  "vni-times" => {charset => "x-viet-vni"},
+  "vni-internet mail" => {charset => "x-viet-vni"},
+  "vni couri" => {charset => "x-viet-vni"},
+  "vps times" => {charset => "x-viet-vps"},
+  vikatan => {charset => "x-vikatan"},
+  webdunia => {charset => "x-webdunia"},
+  xdvng => {charset => 'x-xdvng'},
+};
+
+## Count the font names used in <font face> attributes of the given byte
+## string and return the |$FontDefs| entry of the most frequently used
+## known font, or |undef| if none of the names is in |$FontDefs|.
+sub _detect_font ($$) {
+  my $self = shift;
+  $self->{fonts} = {}; # lowercased font name => number of occurrences
+
+  # 1. Scan from the start of the input (resets the argument's |pos|).
+  (pos $_[0]) = 0;
+
+  my $count = 0; # number of |face| attributes processed so far
+  # 2.
+  LOOP: {
+    $_[0] =~ /\G<!--+>/gc; # comment closed right away, e.g. <!-->
+    $_[0] =~ /\G<!--.*?-->/gcs; # ordinary comment
+    if ($_[0] =~ /\G<[Ff][Oo][Nn][Tt](?=[\x09\x0A\x0C\x0D\x20\x2F])/gc) {
+      # 2.-5.
+      my $attr_list = {};
+
+      # 6.
+      ATTRS: {
+        my $attr = _get_attr ($_[0]) or last ATTRS;
+
+        # 7. Ignore an attribute name already seen on this tag.
+        redo ATTRS if $attr_list->{$attr->{name}};
+
+        # 8.
+        $attr_list->{$attr->{name}} = $attr;
+
+        # 9. Count each comma-separated name of a |face| attribute.
+        if ($attr->{name} eq 'face') {
+          my $attr_value = $attr->{value};
+          $attr_value =~ s/\A[\x09\x0A\x0C\x0D\x20]+//;
+          $attr_value =~ s/[\x09\x0A\x0C\x0D\x20]+\z//;
+          $attr_value =~ tr/A-Z/a-z/;
+          $self->{fonts}->{$_}++ for split /[\x09\x0A\x0C\x0D\x20]*,[\x09\x0A\x0C\x0D\x20]*/, $attr_value;
+          last LOOP if $count++ > 10; # stop after the 12th |face| attribute
+        }
+
+        # 10.
+        last LOOP if pos $_[0] >= length $_[0];
+        redo ATTRS;
+      } # ATTRS
+    } elsif ($_[0] =~ m{\G</?[A-Za-z][^\x09\x0A\x0C\x0D\x20>]*}gc) {
+      {
+        _get_attr ($_[0]) and redo;
+      }
+    } elsif ($_[0] =~ m{\G<[!/?][^>]*}gc) {
+      # Already consumed up to (not including) the next ">"; nothing else to do.
+    }
+
+    # 3. Next byte: skip text up to the next "<", or a single "<" if at one.
+    $_[0] =~ /\G[^<]+/gc || $_[0] =~ /\G</gc;
+    last LOOP if pos $_[0] >= length $_[0];
+    redo LOOP;
+  } # LOOP
+
+  # Most frequent first; ties are broken by name so the result is stable.
+  my $fonts = $self->{fonts};
+  for my $font_name (sort { $fonts->{$b} <=> $fonts->{$a} or $a cmp $b } keys %$fonts) {
+    my $f = $FontDefs->{$font_name};
+    return $f if defined $f;
+  }
+
+  return undef;
+} # _detect_font
+
 1;
 
 =head1 LICENSE
 
-Copyright 2007-2017 Wakaba <wakaba@suikawiki.org>.
+Copyright 2007-2025 Wakaba <wakaba@suikawiki.org>.
 
 This library is free software; you can redistribute it and/or modify
 it under the same terms as Perl itself.
diff --git a/lib/Web/Encoding/UnivCharDet.pm b/lib/Web/Encoding/UnivCharDet.pm
@@ -64,7 +64,6 @@ sub reset ($) {
   my $self = $_[0];
   $self->{done} = 0;
   $self->{best_guess} = -1;
-  $self->{in_tag} = 0;
   $self->{start} = 1;
   $self->{detected_charset} = undef;
   $self->{got_data} = undef;
@@ -74,13 +73,14 @@ sub reset ($) {
   delete $self->{esc_charset_prober};
   delete $self->{utf1632_prober};
   delete $self->{reported};
-  #delete $self->{nbsp_found};
+  delete $self->{nbsp_found};
   delete $self->{esc_found};
   delete $self->{binary_found};
   $self->{win1250_refs} = 0;
   $self->{win1252_refs} = 0;
   $self->{unicode_refs} = 0;
   delete $self->{resolve_latin1_refs};
+  delete $self->{amp};
 } # reset
 
 sub handle_data ($$) {
@@ -124,10 +124,9 @@ sub handle_data ($$) {
   for my $i (0..($length - 1)) {
     my $c = ord substr $_[1], $i, 1;
     $zero++ if $c == 0x00;
-    #if ($c == 0xA0) {
-    #  $self->{nbsp_found} = 1;
-    #} elsif ($c & 0x80) {
-    if ($c & 0x80 and $c != 0xA0) {
+    if ($c == 0xA0) {
+      $self->{nbsp_found} = 1;
+    } elsif ($c & 0x80) {
       if ($self->{input_state} ne 'high byte') {
         $self->{input_state} = 'high byte';
         $high = 1;
@@ -151,6 +150,7 @@ sub handle_data ($$) {
         }
         $self->{last_char} = $c;
       }
+      
       if (defined $self->{amp}) {
         if ($c == 0x3B) {
           if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities->{$self->{amp}}) {
@@ -188,7 +188,7 @@ sub handle_data ($$) {
         } else {
           delete $self->{amp};
         }
-      }
+      } # amp
     }
   } # $i
 
@@ -346,10 +346,10 @@ sub data_end ($) {
       #
     } elsif ($self->{binary_found}) {
       #
-    #} elsif ($self->{nbsp_found}) {
-    #  $self->{reported} = 'windows-1252';
+    } elsif ($self->{nbsp_found}) {
+      $self->{reported} = 'windows-1252';
     } else {
-      $self->{reported} = 'windows-1252'; # ascii
+      $self->{reported} = 'ascii';
     }
   }
 } # data_end
diff --git a/t/Web-Encoding-Sniffer.t b/t/Web-Encoding-Sniffer.t
@@ -37,9 +37,10 @@ for my $test_file_path ($tests_path->children (qr/\.dat$/)) {
       is $sniffer->encoding, $test->{encoding}->[1]->[0];
       is $sniffer->confident ? 'certain' : 'tentative', $test->{confidence}->[1]->[0];
       is $sniffer->source, $test->{source}->[1]->[0];
+      is $sniffer->font_encoding, $test->{font}->[1]->[0];
 
       done $c;
-    } n => 3, name => [$file_name, $test->{name}->[0] || $test->{data}->[0]];
+    } n => 4, name => [$file_name, $test->{name}->[0] || $test->{data}->[0]];
   };
 } # $test_file_path
 
diff --git a/t_deps/tests b/t_deps/tests
@@ -1 +1 @@
-Subproject commit 5fc59a33b52eaf24638b87ba45f4516c2f8a169e
+Subproject commit 430ae367f3de928385fe1115f68b5e528a1895e0