manakai
diff --git a/‎lib/Web/Encoding/UnivCharDet.pm‎
Lines changed: 56 additions & 21 deletions b/‎lib/Web/Encoding/UnivCharDet.pm‎
Lines changed: 56 additions & 21 deletions
diff --git a/‎lib/Web/Encoding/UnivCharDet.pod‎
Lines changed: 25 additions & 14 deletions b/‎lib/Web/Encoding/UnivCharDet.pod‎
Lines changed: 25 additions & 14 deletions
diff --git a/‎lib/Web/Encoding/UnivCharDet/CharsetProber.pm‎
Lines changed: 13 additions & 4 deletions b/‎lib/Web/Encoding/UnivCharDet/CharsetProber.pm‎
Lines changed: 13 additions & 4 deletions
@@ -9,7 +9,7 @@ sub new ($;%) {
   my %args = @_;
 
   $self->{filter} = {ja => 1, zh_hant => 1, zh_hans => 1, ko => 1, non_cjk => 1};
-  $self->{filter}->{utf32} = 1 if $args{utf32};
+  $self->{filter}->{utf} = 1 if $args{utf};
 
   return $self;
 } # new
@@ -23,7 +23,7 @@ sub _detector ($) {
     $filter |= Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN () if $_[0]->{filter}->{ko};
     $filter |= Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK () if $_[0]->{filter}->{non_cjk};
     my $x = Web::Encoding::UnivCharDet::UniversalDetector->new ($filter);
-    $x->{utf32} = 1 if $_[0]->{filter}->{utf32};
+    $x->{utf} = 1 if $_[0]->{filter}->{utf};
     $x;
   };
 } # _detector
@@ -48,11 +48,11 @@ sub _dump ($) {
 package Web::Encoding::UnivCharDet::UniversalDetector;
 our $VERSION = '1.0';
 use Web::Encoding::UnivCharDet::CharsetProber;
+use Web::Encoding::UnivCharDet::UTFCharsetProber;
 
 sub new ($$) {
   my $self = bless {
     lang_filter => $_[1],
-    charset_probers => [],
   }, $_[0];
   $self->reset;
   return $self;
@@ -68,8 +68,9 @@ sub reset ($) {
   $self->{got_data} = undef;
   $self->{input_state} = 'pure ascii';
   $self->{last_char} = 0x00;
-  $self->{esc_charset_prober}->reset if $self->{esc_charset_prober};
-  $_->reset for grep { $_ } @{$self->{charset_probers}};
+  $self->{charset_probers} = [];
+  delete $self->{esc_charset_prober};
+  delete $self->{utf1632_prober};
 } # reset
 
 sub handle_data ($$) {
@@ -88,7 +89,7 @@ sub handle_data ($$) {
       $self->{detected_charset} = 'utf-16le';
     }
 
-    if ($self->{utf32}) {
+    if ($self->{utf}) {
       ## <https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>
       if ($_[1] =~ /^\xFE\xFF\x00\x00/) {
         $self->{detected_charset} = 'x-iso-10646-ucs-4-3412';
@@ -107,16 +108,20 @@ sub handle_data ($$) {
     }
   } # start
 
-  for my $i (0..((length $_[1]) - 1)) {
+  my $length = length $_[1];
+  my $zero = 0;
+  for my $i (0..($length - 1)) {
     my $c = ord substr $_[1], $i, 1;
+    $zero++ if $c == 0x00;
     if ($c & 0x80 and $c != 0xA0) {
       if ($self->{input_state} ne 'high byte') {
         $self->{input_state} = 'high byte';
-        delete $self->{esc_charset_prober} if $self->{esc_charset_prober};
+        delete $self->{esc_charset_prober};
+        delete $self->{utf1632_prober};
 
         $self->{charset_probers}->[0]
             ||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup->new
-                ($self->{lang_filter});
+                    ($self->{lang_filter});
         $self->{charset_probers}->[1]
             ||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup->new
             if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
@@ -133,51 +138,75 @@ sub handle_data ($$) {
     }
   } # $i
 
+  if ($self->{utf} and $zero) {
+    if ($zero / ($length || 1) > 0.1) { # random threshold
+      $self->{charset_probers} = [];
+    }
+    $self->{utf1632_prober} ||= Web::Encoding::UnivCharDet::UTFCharsetProber->new;
+  }
+  if (defined $self->{utf1632_prober}) {
+    {
+      my $st = $self->{utf1632_prober}->handle_data ($_[1]);
+      if ($st eq 'found it') {
+        $self->{done} = 1;
+        $self->{detected_charset} = $self->{utf1632_prober}->get_charset_name; # non-undef when found
+        return 1;
+      }
+    }
+  }
+
   if ($self->{input_state} eq 'esc ascii') {
     $self->{esc_charset_prober}
         ||= Web::Encoding::UnivCharDet::CharsetProber::ESC->new
-            ($self->{lang_filter});
-    my $st = $self->{esc_charset_prober}->handle_data ($_[1]);
-    if ($st eq 'found it') {
-      $self->{done} = 1;
-      $self->{detected_charset} = $self->{esc_charset_prober}->get_charset_name;
+                ($self->{lang_filter});
+    {
+      my $st = $self->{esc_charset_prober}->handle_data ($_[1]);
+      if ($st eq 'found it') {
+        $self->{done} = 1;
+        $self->{detected_charset} = $self->{esc_charset_prober}->get_charset_name; # non-undef when found
+        return 1;
+      }
     }
   } elsif ($self->{input_state} eq 'high byte') {
-    for (grep { $_ } @{$self->{charset_probers}}) {
+    for (grep { defined $_ } @{$self->{charset_probers}}) {
       my $st = $_->handle_data ($_[1]);
       if ($st eq 'found it') {
         $self->{done} = 1;
-        $self->{detected_charset} = $_->get_charset_name;
+        $self->{detected_charset} = $_->get_charset_name; # non-undef when found
         return 1;
       }
     }
   }
-
+  
   return 1;
 } # handle_data
 
 sub data_end ($) {
   my $self = $_[0];
   return unless $self->{got_data};
 
-  if ($self->{detected_charset}) {
+  if (defined $self->{detected_charset}) { # a charset has already been detected; report it as is
     $self->{done} = 1;
     $self->{reported} = $self->{detected_charset};
     return;
   }
 
+  if (defined $self->{utf1632_prober}) { # provisional guess; the high-byte pass below may overwrite it
+    $self->{reported} = $self->{utf1632_prober}->get_charset_name; # or undef
+  }
+
   if ($self->{input_state} eq 'high byte') {
     my $max_prober_confidence = 0.0;
     my $max_prober;
-    for (grep { $_ } @{$self->{charset_probers}}) {
+    for (grep { defined $_ } @{$self->{charset_probers}}) { # skip slots that hold no prober
       my $prober_confidence = $_->get_confidence;
       if ($prober_confidence > $max_prober_confidence) {
         $max_prober_confidence = $prober_confidence;
         $max_prober = $_;
       }
     }
     if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
-      $self->{reported} = $max_prober->get_charset_name;
+      $self->{reported} = $max_prober->get_charset_name; # or undef (but unlikely?)
     }
   }
 } # data_end
@@ -187,7 +216,13 @@ sub get_reported_charset ($) {
 } # get_reported_charset
 
 sub dump_status ($) {
-  $_->dump_status for grep { $_ } @{$_[0]->{charset_probers}};
+  my $self = $_[0];
+  print "Input state: $self->{input_state}\n";
+  $_->dump_status for grep { defined $_ } # only probers that currently exist
+      @{$self->{charset_probers}},
+      $self->{esc_charset_prober},
+      $self->{utf1632_prober};
+  print "Reported: @{[$self->{reported} // '']}\n"; # empty when no charset has been reported
 } # dump_status
 
 1;
 
@@ -29,8 +29,8 @@ Return a new instance of the universal detector.
 
 Zero or more named parameters can be specified.
 
-If C<utf32> parameter is set to a true value, the UTF-32 BOM sniffing
-is enabled.  Otherwise, it is disabled.
+If the C<utf> parameter is set to a true value, UTF-32 and UTF-16
+detection is enabled.  Otherwise, it is disabled.
 
 =item $charset = $det->detect_byte_string ($bytes)
 
@@ -47,14 +47,18 @@ are returned:
   windows-1253 iso-8859-8 windows-1255 windows-1252
 
 Note that C<x-euc-tw> and C<ibm855> are not supported by the Encoding
-Standard.
+Standard.  The encoding labels C<iso-2022-cn>, C<hz-gb-2312>, and
+C<iso-2022-kr> are supported by it, but are associated with the
+replacement encoding.
 
-If the UTF-32 BOM sniffing is enabled, one of the following encoding
-labels might also be returned:
+If UTF-32 and UTF-16 detection is enabled, one of the following
+encoding labels might also be returned:
 
   utf-32be utf-32le x-iso-10646-ucs-4-3412 x-iso-10646-ucs-4-2143
+  utf-16be utf-16le
 
-Note that they are not supported by the Encoding Standard.
+Note that labels other than C<utf-16be> and C<utf-16le> are not
+supported by the Encoding Standard.
 
 Returned encoding names are in lowercase.
 
@@ -91,15 +95,19 @@ S. Li, K. Momoi. Netscape. In Proceedings of the 19th International
 Unicode Conference.
 <https://www-archive.mozilla.org/projects/intl/universalcharsetdetection>.
 
-[MOZILLAUNIVDET] universalchardet in Mozilla repository.
-
-This implementation derives from Mozilla's C++ implementation as of
-May 1, 2013
+[MOZILLAUNIVDET] universalchardet in Mozilla repository.  This
+implementation derives from Mozilla's C++ implementation as of May 1,
+2013
 <https://hg.mozilla.org/mozilla-central/archive/c0e81c0222fc.zip>.
 
+[MOZILLAUNIVDETBOM]
+<https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>.
+
+[CHARDET] Chardet: The Universal Character Encoding Detector,
+<https://github.com/chardet/chardet>.
+
 [SWUNIVERSALCHARDET] SuikaWiki:UniversalCharDet
-<https://suika.suikawiki.org/~wakaba/wiki/sw/n/UniversalCharDet> (In
-Japanese).
+<https://wiki.suikawiki.org/n/UniversalCharDet>.
 
 =head1 AUTHOR
 
@@ -108,13 +116,16 @@ Wakaba <wakaba@suikawiki.org>.
 =head1 ACKNOWLEDGEMENTS
 
 Thanks to the authors and contributors of Mozilla's original universal
-detector implementation [MOZILLAUNIVDET], from which the Perl port
-derives.
+detector implementation [MOZILLAUNIVDET] and its variants
+[MOZILLAUNIVDETBOM] [CHARDET], from which the Perl port derives.
 
 =head1 LICENSE
 
 This Source Code Form is subject to the terms of the Mozilla Public
 License, v. 2.0. If a copy of the MPL was not distributed with this
 file, You can obtain one at <https://mozilla.org/MPL/2.0/>.
 
+Note that some of the files are licensed under different terms.  See
+their documentation.
+
 =cut
@@ -75,6 +75,11 @@ sub filter_with_english_letters ($$) {
   return $new;
 } # filter_with_english_letters
 
+sub dump_status ($) {
+  my $self = $_[0];
+  printf "%s\n", ref $self; # print the class name of the invocant on its own line
+} # dump_status
+
 package Web::Encoding::UnivCharDet::CharsetProber::Latin1;
 push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
 our $VERSION = '1.0';
@@ -156,7 +161,8 @@ sub handle_data ($$) {
   my $new_buf1 = $self->filter_with_english_letters ($_[1]);
 
   for my $i (0..((length $new_buf1) - 1)) {
-    my $char_class = $Latin1_CharToClass->[ord substr $new_buf1, $i, 1];
+    my $c = ord substr $new_buf1, $i, 1;
+    my $char_class = $Latin1_CharToClass->[$c];
     my $freq = $Latin1ClassModel->[$self->{last_char_class}*CLASS_NUM + $char_class];
     if ($freq == 0) {
       $self->{state} = 'not me';
@@ -557,9 +563,11 @@ package Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup;
 push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
 our $VERSION = '1.0';
 
-sub new ($$) {
-  my $self = bless {}, $_[0];
-  my $filter = $_[1];
+sub new ($$;%) {
+  my $self = bless {}, shift;
+  my $filter = shift;
+  my %args = @_;
+  
   $self->{probers} = [
     Web::Encoding::UnivCharDet::CharsetProber::UTF8->new,
     $filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
@@ -587,6 +595,7 @@ sub new ($$) {
               ($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL)
         : undef,
   ];
+
   $self->reset;
   return $self;
 } # new