Added MacRoman detection support; Fixed utf flagging API for consistency

wakaba · wakaba · commit aa8fa6b19389 · 2025-10-19T23:24:49.000+09:00
diff --git a/lib/Web/Encoding/UnivCharDet.pm b/lib/Web/Encoding/UnivCharDet.pm
@@ -4,12 +4,10 @@ use warnings;
 our $VERSION = '1.0';
 use Web::Encoding::UnivCharDet::Defs;
 
-sub new ($;%) {
+sub new ($) {
   my $self = bless {}, shift;
-  my %args = @_;
   
   $self->{filter} = {ja => 1, zh_hant => 1, zh_hans => 1, ko => 1, non_cjk => 1};
-  $self->{filter}->{utf} = 1 if $args{utf};
   
   return $self;
 } # new
@@ -49,6 +47,7 @@ package Web::Encoding::UnivCharDet::UniversalDetector;
 our $VERSION = '1.0';
 use Web::Encoding::UnivCharDet::CharsetProber;
 use Web::Encoding::UnivCharDet::UTFCharsetProber;
+use Web::Encoding::UnivCharDet::MacCharsetProber;
 
 sub new ($$) {
   my $self = bless {
@@ -71,6 +70,7 @@ sub reset ($) {
   $self->{charset_probers} = [];
   delete $self->{esc_charset_prober};
   delete $self->{utf1632_prober};
+  delete $self->{reported};
 } # reset
 
 sub handle_data ($$) {
@@ -127,6 +127,8 @@ sub handle_data ($$) {
             if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
         $self->{charset_probers}->[2]
             ||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new;
+        $self->{charset_probers}->[3]
+            ||= Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman->new;
       }
     } else {
       if ($self->{input_state} eq 'pure ascii' and
diff --git a/lib/Web/Encoding/UnivCharDet.pod b/lib/Web/Encoding/UnivCharDet.pod
@@ -23,14 +23,12 @@ Following methods are available:
 
 =item $det = Web::Encoding::UnivCharDet->new
 
-=item $det = Web::Encoding::UnivCharDet->new (NAME => VALUE, ...)
-
 Return a new instance of the universal detector.
 
-Zero or more named parameters can be specified.
+=item $det->filter->{utf} = Boolean (default: false)
 
-If C<utf> parameter is set to a true value, the UTF-32 and UTF-16
-detection is enabled.  Otherwise, it is disabled.
+If true, the UTF-32 and UTF-16 detection is enabled.  Otherwise, it is
+disabled.
 
 =item $charset = $det->detect_byte_string ($bytes)
 
@@ -44,7 +42,7 @@ are returned:
   utf-8 utf-16be utf-16le iso-2022-cn big5 x-euc-tw gb18030 hz-gb-2312
   iso-2022-jp shift_jis euc-jp iso-2022-kr euc-kr iso-8859-5 koi8-r
   windows-1251 x-mac-cyrillic ibm866 ibm855 iso-8859-7 tis-620
-  windows-1253 iso-8859-8 windows-1255 windows-1252
+  windows-1253 iso-8859-8 windows-1255 windows-1252 macintosh
 
 Note that C<x-euc-tw> and C<ibm855> are not supported by the Encoding
 Standard.  Encoding labels C<iso-2022-cn>, C<hz-gb-2312>, and
@@ -90,26 +88,31 @@ defined in the Encoding Standard at the time of writing.)
 
 =head1 SEE ALSO
 
-[UNIVCHARDET] A composite approach to language/encoding detection,
-S. Li, K. Momoi. Netscape. In Proceedings of the 19th International
-Unicode Conference.
+[UNIVCHARDET] "A composite approach to language/encoding detection",
+S. Li, K. Momoi, Proceedings of the 19th International Unicode
+Conference
 <https://www-archive.mozilla.org/projects/intl/universalcharsetdetection>.
 
 [MOZILLAUNIVDET] universalchardet in Mozilla repository.  This
-implementation derives from Mozilla's C++ implementation as of May 1,
+implementation derives from Mozilla's C++ implementation as of 1 May
 2013
 <https://hg.mozilla.org/mozilla-central/archive/c0e81c0222fc.zip>.
 
-[MOZILLAUNIVDETBOM]
+[MOZILLAUNIVDETBOM] "Make Universal Charset Autodetector recognise UTF
+by BOM", Alexey Chernyak, 3 Nov 2005
 <https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>.
 
-[CHARDET] Chardet: The Universal Character Encoding Detector,
+[CHARDET] "Chardet: The Universal Character Encoding Detector"
 <https://github.com/chardet/chardet>.
 
-[CHARADE13] Support CP949, fixes #10
+[CHARDETMAC] "Added a prober for MacRoman encoding.", Rob Speer, 29
+Jun 2022
+<https://github.com/chardet/chardet/commit/c292b52a97e57c95429ef559af36845019b88b33>.
+
+[CHARADE13] "Support CP949, fixes #10", puzzlet, 25 Jan 2013
 <https://github.com/sv24-archive/charade/pull/13/files>.
 
-[SWUNIVERSALCHARDET] SuikaWiki:UniversalCharDet
+[SWUNIVERSALCHARDET] "UniversalCharDet", SuikaWiki authors
 <https://wiki.suikawiki.org/n/UniversalCharDet>.
 
 =head1 AUTHOR
@@ -120,8 +123,8 @@ Wakaba <wakaba@suikawiki.org>.
 
 Thanks to the authors and contributors of Mozilla's original universal
 detector implementation [MOZILLAUNIVDET] and its variants
-[MOZILLAUNIVDETBOM] [CHARADE13] [CHARDET], from which the Perl port
-derives.
+[MOZILLAUNIVDETBOM] [CHARADE13] [CHARDET] [CHARDETMAC], from which the
+Perl port derives.
 
 =head1 LICENSE
 
diff --git a/lib/Web/Encoding/UnivCharDet/Defs2.pm b/lib/Web/Encoding/UnivCharDet/Defs2.pm
@@ -79,6 +79,7 @@ This module derived from
 # the Initial Developer. All Rights Reserved.
 #
 # Contributor(s):
+#   Wakaba <wakaba@suikawiki.org>
 #   Mark Pilgrim - port to Python
 #
 # This library is free software; you can redistribute it and/or
diff --git a/lib/Web/Encoding/UnivCharDet/MacCharsetProber.pm b/lib/Web/Encoding/UnivCharDet/MacCharsetProber.pm
@@ -0,0 +1,195 @@
+package Web::Encoding::UnivCharDet::MacCharsetProber;
+use strict;
+use warnings;
+our $VERSION = '1.0';
+use Web::Encoding::UnivCharDet::CharsetProber;
+
+package Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman;
+push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
+our $VERSION = '1.0';
+
+sub FREQ_CAT_NUM () { 4 }  # number of likelihood categories (values 0..3 of the class model)
+sub UDF () { 0 }  # character classes start here; UDF is never assigned to a byte (presumably "undefined")
+sub OTH () { 1 }  # "other": any byte not covered by the classes below
+sub ASC () { 2 }  # ASCII capital letter (bytes 0x41-0x5A in the class table)
+sub ASS () { 3 }  # ASCII small letter (bytes 0x61-0x7A in the class table)
+sub ACV () { 4 }  # presumably "accented capital vowel" -- confirm against upstream chardet
+sub ACO () { 5 }  # presumably "accented capital, other" -- confirm against upstream chardet
+sub ASV () { 6 }  # presumably "accented small vowel" -- confirm against upstream chardet
+sub ASO () { 7 }  # presumably "accented small, other" -- confirm against upstream chardet
+sub ODD () { 8 }  # extended bytes for rare symbols; the model never rates them above "very unlikely"
+sub CLASS_NUM () { 9 }  # number of character classes, i.e. the row width of the class model
+
+# The change from Latin1 is that we explicitly look for extended characters
+# that are infrequently-occurring symbols, and consider them to always be
+# improbable. This should let MacRoman get out of the way of more likely
+# encodings in most situations.
+
+my $MacRoman_CharToClass = [  # indexed by byte value (0x00-0xFF); yields that byte's character class
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 00 - 07
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 08 - 0F
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 10 - 17
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 18 - 1F
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 20 - 27
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 28 - 2F
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 30 - 37
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # 38 - 3F
+    OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 40 - 47
+    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 48 - 4F
+    ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC,  # 50 - 57
+    ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH,  # 58 - 5F
+    OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 60 - 67
+    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 68 - 6F
+    ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS,  # 70 - 77
+    ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH,  # 78 - 7F
+    ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV,  # 80 - 87
+    ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV,  # 88 - 8F
+    ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV,  # 90 - 97
+    ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV,  # 98 - 9F
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO,  # A0 - A7
+    OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV,  # A8 - AF
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH,  # B0 - B7
+    OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV,  # B8 - BF
+    OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH,  # C0 - C7
+    OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV,  # C8 - CF
+    OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD,  # D0 - D7
+    ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH,  # D8 - DF
+    OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV,  # E0 - E7
+    ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV,  # E8 - EF
+    ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD,  # F0 - F7
+    ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD,  # F8 - FF
+];
+
+# 0 : illegal (handle_data stops and sets the state to 'not me')
+# 1 : very unlikely (each occurrence is weighted 20x against the confidence)
+# 2 : normal (only adds to the total the confidence is divided by)
+# 3 : very likely (counts in favour of the confidence)
+my $MacRomanClassModel = [  # flat CLASS_NUM x CLASS_NUM matrix: row = previous class, column = current class
+# UDF OTH ASC ASS ACV ACO ASV ASO ODD
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  # UDF
+    0,  3,  3,  3,  3,  3,  3,  3,  1,  # OTH
+    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ASC
+    0,  3,  3,  3,  1,  1,  3,  3,  1,  # ASS
+    0,  3,  3,  3,  1,  2,  1,  2,  1,  # ACV
+    0,  3,  3,  3,  3,  3,  3,  3,  1,  # ACO
+    0,  3,  1,  3,  1,  1,  1,  3,  1,  # ASV
+    0,  3,  1,  3,  1,  1,  3,  3,  1,  # ASO
+    0,  1,  1,  1,  1,  1,  1,  1,  1,  # ODD
+];
+
+sub new ($) { # constructor: returns a prober that has already been |reset|
+  my $prober = bless {}, shift;
+  $prober->reset;
+  return $prober;
+} # new
+
+sub reset ($) {
+  ## Put the prober back into its initial state: still detecting, the
+  ## previous character class set to OTH, and every counter cleared.
+  my $prober = shift;
+  @$prober{qw(state last_char_class)} = ('detecting', OTH);
+  my $counts = $prober->{freq_counter} ||= [];
+  @$counts[0 .. FREQ_CAT_NUM - 1] = (0) x FREQ_CAT_NUM;
+  ## MacRoman is a somewhat rare encoding.  Express that prior by
+  ## pre-loading the "normal" counter, a handicap that must be overcome.
+  $counts->[2] = 10;
+} # reset
+
+sub get_charset_name ($) { return 'macintosh' } # name this prober reports for MacRoman
+
+sub handle_data ($$) {
+  ## Feed a chunk of input to the prober and return its state.  Every
+  ## character kept by |filter_with_english_letters| is classified, and
+  ## the likelihood of each (previous class, current class) pair is tallied.
+  my $prober = $_[0];
+  my $filtered = $prober->filter_with_english_letters ($_[1]);
+  for my $code (map { ord } split //, $filtered) {
+    my $class = $MacRoman_CharToClass->[$code];
+    my $likelihood = $MacRomanClassModel->[$prober->{last_char_class} * CLASS_NUM + $class];
+    if ($likelihood == 0) { # illegal transition: this cannot be MacRoman
+      $prober->{state} = 'not me';
+      last;
+    }
+    $prober->{freq_counter}->[$likelihood]++;
+    $prober->{last_char_class} = $class;
+  } # $code
+  return $prober->{state};
+} # handle_data
+
+sub get_confidence ($) {
+  ## Return the current confidence that the input is MacRoman.
+  ##
+  ## A prober that has already ruled itself out reports a token 0.01.
+  ## Otherwise the score is
+  ##
+  ##   (very likely - 20 * very unlikely) / (sum of all counters)
+  ##
+  ## clamped at zero and then scaled down.
+  my $prober = shift;
+  return 0.01 if $prober->{state} eq 'not me';
+
+  my $counts = $prober->{freq_counter};
+  my $total = 0;
+  $total += $counts->[$_] for 0 .. FREQ_CAT_NUM - 1;
+
+  my $score = $total < 0.01
+      ? 0.0
+      : ($counts->[3] - $counts->[1] * 20.0) / $total;
+  $score = 0.0 if $score < 0.0;
+
+  ## Scale MacRoman's confidence down so that other, more accurate
+  ## detectors can take priority.
+  return $score * 0.73;
+} # get_confidence
+
+sub dump_status ($) { # print this prober's confidence and charset name to the selected handle
+  my $prober = shift;
+  my ($score, $label) = ($prober->get_confidence, $prober->get_charset_name);
+  printf " MacRomanProber: %1.3f [%s]\n", $score, $label;
+} # dump_status
+
+1;
+
+=head1 AUTHOR
+
+Wakaba <wakaba@suikawiki.org>.
+
+=head1 ACKNOWLEDGEMENTS
+
+This module is derived from
+<https://github.com/chardet/chardet/commit/c292b52a97e57c95429ef559af36845019b88b33>.
+
+=head1 LICENSE
+
+######################## BEGIN LICENSE BLOCK ########################
+# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
+# The Original Code is Mozilla Universal charset detector code.
+#
+# The Initial Developer of the Original Code is
+# Netscape Communications Corporation.
+# Portions created by the Initial Developer are Copyright (C) 2001
+# the Initial Developer. All Rights Reserved.
+#
+# Contributor(s):
+#   Wakaba <wakaba@suikawiki.org>
+#   Rob Speer - adapt to MacRoman encoding
+#   Mark Pilgrim - port to Python
+#   Shy Shalom - original C code
+#
+# This library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+#
+# This library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+#
+# You should have received a copy of the GNU Lesser General Public
+# License along with this library; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
+# 02110-1301  USA
+######################### END LICENSE BLOCK #########################
+
+=cut
diff --git a/lib/Web/Encoding/UnivCharDet/UTFCharsetProber.pm b/lib/Web/Encoding/UnivCharDet/UTFCharsetProber.pm
@@ -265,6 +265,7 @@ as of 19 October Reiwa 7 (2025), i.e.
 ######################## BEGIN LICENSE BLOCK ########################
 #
 # Contributor(s):
+#   Wakaba <wakaba@suikawiki.org>
 #   Jason Zavaglia
 #
 # This library is free software; you can redistribute it and/or
diff --git a/t/Web-Encoding-UnivCharDet.t b/t/Web-Encoding-UnivCharDet.t
@@ -51,19 +51,23 @@ test {
   my $det = Web::Encoding::UnivCharDet->new;
   is $det->detect_byte_string ("\xFF\xFE\x00\x00"), 'utf-16le';
   is $det->detect_byte_string ("\x00\x00\xFF\xFE"), 'windows-1252';
+  is $det->detect_byte_string ("abc\x8Fxyz\x8D"), 'macintosh';
 
   done $c;
-} name => 'no utf flag', n => 2;
+} name => 'no utf flag', n => 3;
 
 test {
   my $c = shift;
   
-  my $det = Web::Encoding::UnivCharDet->new (utf => 1);
+  my $det = Web::Encoding::UnivCharDet->new;
+  $det->filter->{utf} = 1;
+  
   is $det->detect_byte_string ("\xFF\xFE\x00\x00"), 'utf-32le';
   is $det->detect_byte_string ("\x00\x00\xFF\xFE"), 'x-iso-10646-ucs-4-2143';
+  is $det->detect_byte_string ("abc\x8Fxyz\x8D"), 'macintosh';
 
   done $c;
-} name => 'with utf flag', n => 2;
+} name => 'with utf flag', n => 3;
 
 run_tests;
 

Original file line number	Diff line number	Diff line change
`@@ -79,6 +79,7 @@ This module derived from`
`79`	`79`	`# the Initial Developer. All Rights Reserved.`
`80`	`80`	`#`
`81`	`81`	`# Contributor(s):`
	`82`	`+# Wakaba <wakaba@suikawiki.org>`
`82`	`83`	`# Mark Pilgrim - port to Python`
`83`	`84`	`#`
`84`	`85`	`# This library is free software; you can redistribute it and/or`
Original file line number	Diff line number	Diff line change
`@@ -265,6 +265,7 @@ as of 19 October Reiwa 7 (2025), i.e.`
`265`	`265`	`######################## BEGIN LICENSE BLOCK ########################`
`266`	`266`	`#`
`267`	`267`	`# Contributor(s):`
	`268`	`+# Wakaba <wakaba@suikawiki.org>`
`268`	`269`	`# Jason Zavaglia`
`269`	`270`	`#`
`270`	`271`	`# This library is free software; you can redistribute it and/or`