Skip to content

Commit 5d3df3e

Browse files
committed
Added UTF-16 and UTF-32 detections
1 parent eda75e3 commit 5d3df3e

File tree

5 files changed

+383
-42
lines changed

5 files changed

+383
-42
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 56 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ sub new ($;%) {
99
my %args = @_;
1010

1111
$self->{filter} = {ja => 1, zh_hant => 1, zh_hans => 1, ko => 1, non_cjk => 1};
12-
$self->{filter}->{utf32} = 1 if $args{utf32};
12+
$self->{filter}->{utf} = 1 if $args{utf};
1313

1414
return $self;
1515
} # new
@@ -23,7 +23,7 @@ sub _detector ($) {
2323
$filter |= Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN () if $_[0]->{filter}->{ko};
2424
$filter |= Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK () if $_[0]->{filter}->{non_cjk};
2525
my $x = Web::Encoding::UnivCharDet::UniversalDetector->new ($filter);
26-
$x->{utf32} = 1 if $_[0]->{filter}->{utf32};
26+
$x->{utf} = 1 if $_[0]->{filter}->{utf};
2727
$x;
2828
};
2929
} # _detector
@@ -48,11 +48,11 @@ sub _dump ($) {
4848
package Web::Encoding::UnivCharDet::UniversalDetector;
4949
our $VERSION = '1.0';
5050
use Web::Encoding::UnivCharDet::CharsetProber;
51+
use Web::Encoding::UnivCharDet::UTFCharsetProber;
5152

5253
sub new ($$) {
5354
my $self = bless {
5455
lang_filter => $_[1],
55-
charset_probers => [],
5656
}, $_[0];
5757
$self->reset;
5858
return $self;
@@ -68,8 +68,9 @@ sub reset ($) {
6868
$self->{got_data} = undef;
6969
$self->{input_state} = 'pure ascii';
7070
$self->{last_char} = 0x00;
71-
$self->{esc_charset_prober}->reset if $self->{esc_charset_prober};
72-
$_->reset for grep { $_ } @{$self->{charset_probers}};
71+
$self->{charset_probers} = [];
72+
delete $self->{esc_charset_prober};
73+
delete $self->{utf1632_prober};
7374
} # reset
7475

7576
sub handle_data ($$) {
@@ -88,7 +89,7 @@ sub handle_data ($$) {
8889
$self->{detected_charset} = 'utf-16le';
8990
}
9091

91-
if ($self->{utf32}) {
92+
if ($self->{utf}) {
9293
## <https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>
9394
if ($_[1] =~ /^\xFE\xFF\x00\x00/) {
9495
$self->{detected_charset} = 'x-iso-10646-ucs-4-3412';
@@ -107,16 +108,20 @@ sub handle_data ($$) {
107108
}
108109
} # start
109110

110-
for my $i (0..((length $_[1]) - 1)) {
111+
my $length = length $_[1];
112+
my $zero = 0;
113+
for my $i (0..($length - 1)) {
111114
my $c = ord substr $_[1], $i, 1;
115+
$zero++ if $c == 0x00;
112116
if ($c & 0x80 and $c != 0xA0) {
113117
if ($self->{input_state} ne 'high byte') {
114118
$self->{input_state} = 'high byte';
115-
delete $self->{esc_charset_prober} if $self->{esc_charset_prober};
119+
delete $self->{esc_charset_prober};
120+
delete $self->{utf1632_prober};
116121

117122
$self->{charset_probers}->[0]
118123
||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup->new
119-
($self->{lang_filter});
124+
($self->{lang_filter});
120125
$self->{charset_probers}->[1]
121126
||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup->new
122127
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
@@ -133,51 +138,75 @@ sub handle_data ($$) {
133138
}
134139
} # $i
135140

141+
if ($self->{utf} and $zero) {
142+
if ($zero / ($length || 1) > 0.1) { # random threshold
143+
$self->{charset_probers} = [];
144+
}
145+
$self->{utf1632_prober} ||= Web::Encoding::UnivCharDet::UTFCharsetProber->new;
146+
}
147+
if (defined $self->{utf1632_prober}) {
148+
{
149+
my $st = $self->{utf1632_prober}->handle_data ($_[1]);
150+
if ($st eq 'found it') {
151+
$self->{done} = 1;
152+
$self->{detected_charset} = $self->{utf1632_prober}->get_charset_name; # non-undef when found
153+
return 1;
154+
}
155+
}
156+
}
157+
136158
if ($self->{input_state} eq 'esc ascii') {
137159
$self->{esc_charset_prober}
138160
||= Web::Encoding::UnivCharDet::CharsetProber::ESC->new
139-
($self->{lang_filter});
140-
my $st = $self->{esc_charset_prober}->handle_data ($_[1]);
141-
if ($st eq 'found it') {
142-
$self->{done} = 1;
143-
$self->{detected_charset} = $self->{esc_charset_prober}->get_charset_name;
161+
($self->{lang_filter});
162+
{
163+
my $st = $self->{esc_charset_prober}->handle_data ($_[1]);
164+
if ($st eq 'found it') {
165+
$self->{done} = 1;
166+
$self->{detected_charset} = $self->{esc_charset_prober}->get_charset_name; # non-undef when found
167+
return 1;
168+
}
144169
}
145170
} elsif ($self->{input_state} eq 'high byte') {
146-
for (grep { $_ } @{$self->{charset_probers}}) {
171+
for (grep { defined $_ } @{$self->{charset_probers}}) {
147172
my $st = $_->handle_data ($_[1]);
148173
if ($st eq 'found it') {
149174
$self->{done} = 1;
150-
$self->{detected_charset} = $_->get_charset_name;
175+
$self->{detected_charset} = $_->get_charset_name; # non-undef when found
151176
return 1;
152177
}
153178
}
154179
}
155-
180+
156181
return 1;
157182
} # handle_data
158183

159184
sub data_end ($) {
160185
my $self = $_[0];
161186
return unless $self->{got_data};
162187

163-
if ($self->{detected_charset}) {
188+
if (defined $self->{detected_charset}) {
164189
$self->{done} = 1;
165190
$self->{reported} = $self->{detected_charset};
166191
return;
167192
}
168193

194+
if (defined $self->{utf1632_prober}) {
195+
$self->{reported} = $self->{utf1632_prober}->get_charset_name; # or undef
196+
}
197+
169198
if ($self->{input_state} eq 'high byte') {
170199
my $max_prober_confidence = 0.0;
171200
my $max_prober;
172-
for (grep { $_ } @{$self->{charset_probers}}) {
201+
for (grep { defined $_ } @{$self->{charset_probers}}) {
173202
my $prober_confidence = $_->get_confidence;
174203
if ($prober_confidence > $max_prober_confidence) {
175204
$max_prober_confidence = $prober_confidence;
176205
$max_prober = $_;
177206
}
178207
}
179208
if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
180-
$self->{reported} = $max_prober->get_charset_name;
209+
$self->{reported} = $max_prober->get_charset_name; # or undef (but unlikely?)
181210
}
182211
}
183212
} # data_end
@@ -187,7 +216,13 @@ sub get_reported_charset ($) {
187216
} # get_reported_charset
188217

189218
sub dump_status ($) {
190-
$_->dump_status for grep { $_ } @{$_[0]->{charset_probers}};
219+
my $self = $_[0];
220+
print "Input state: $self->{input_state}\n";
221+
$_->dump_status for grep { defined $_ }
222+
@{$self->{charset_probers}},
223+
$self->{esc_charset_prober},
224+
$self->{utf1632_prober};
225+
print "Reported: @{[$self->{reported} // '']}\n";
191226
} # dump_status
192227

193228
1;

lib/Web/Encoding/UnivCharDet.pod

Lines changed: 25 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,8 @@ Return a new instance of the universal detector.
2929

3030
Zero or more named parameters can be specified.
3131

32-
If C<utf32> parameter is set to a true value, the UTF-32 BOM sniffing
33-
is enabled. Otherwise, it is disabled.
32+
If the C<utf> parameter is set to a true value, the UTF-32 and UTF-16
33+
detection is enabled. Otherwise, it is disabled.
3434

3535
=item $charset = $det->detect_byte_string ($bytes)
3636

@@ -47,14 +47,18 @@ are returned:
4747
windows-1253 iso-8859-8 windows-1255 windows-1252
4848

4949
Note that C<x-euc-tw> and C<ibm855> are not supported by the Encoding
50-
Standard.
50+
Standard. Encoding labels C<iso-2022-cn>, C<hz-gb-2312>, and
51+
C<iso-2022-kr> are supported but associated with the replacement
52+
encoding.
5153

52-
If the UTF-32 BOM sniffing is enabled, one of the following encoding
53-
labels might also be returned:
54+
If the UTF-32 and UTF-16 detection is enabled, one of the following
55+
encoding labels might also be returned:
5456

5557
utf-32be utf-32le x-iso-10646-ucs-4-3412 x-iso-10646-ucs-4-2143
58+
utf-16be utf-16le
5659

57-
Note that they are not supported by the Encoding Standard.
60+
Note that labels other than C<utf-16be> and C<utf-16le> are not
61+
supported by the Encoding Standard.
5862

5963
Returned encoding names are in lowercase.
6064

@@ -91,15 +95,19 @@ S. Li, K. Momoi. Netscape. In Proceedings of the 19th International
9195
Unicode Conference.
9296
<https://www-archive.mozilla.org/projects/intl/universalcharsetdetection>.
9397

94-
[MOZILLAUNIVDET] universalchardet in Mozilla repository.
95-
96-
This implementation derives from Mozilla's C++ implementation as of
97-
May 1, 2013
98+
[MOZILLAUNIVDET] universalchardet in Mozilla repository. This
99+
implementation derives from Mozilla's C++ implementation as of May 1,
100+
2013
98101
<https://hg.mozilla.org/mozilla-central/archive/c0e81c0222fc.zip>.
99102

103+
[MOZILLAUNIVDETBOM]
104+
<https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>.
105+
106+
[CHARDET] Chardet: The Universal Character Encoding Detector,
107+
<https://github.com/chardet/chardet>.
108+
100109
[SWUNIVERSALCHARDET] SuikaWiki:UniversalCharDet
101-
<https://suika.suikawiki.org/~wakaba/wiki/sw/n/UniversalCharDet> (In
102-
Japanese).
110+
<https://wiki.suikawiki.org/n/UniversalCharDet>.
103111

104112
=head1 AUTHOR
105113

@@ -108,13 +116,16 @@ Wakaba <wakaba@suikawiki.org>.
108116
=head1 ACKNOWLEDGEMENTS
109117

110118
Thanks to the authors and contributors of Mozilla's original universal
111-
detector implementation [MOZILLAUNIVDET], from which the Perl port
112-
derives.
119+
detector implementation [MOZILLAUNIVDET] and its variants
120+
[MOZILLAUNIVDETBOM] [CHARDET], from which the Perl port derives.
113121

114122
=head1 LICENSE
115123

116124
This Source Code Form is subject to the terms of the Mozilla Public
117125
License, v. 2.0. If a copy of the MPL was not distributed with this
118126
file, You can obtain one at <https://mozilla.org/MPL/2.0/>.
119127

128+
Note that some of the files are licensed under different terms. See their
129+
documentation.
130+
120131
=cut

lib/Web/Encoding/UnivCharDet/CharsetProber.pm

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,11 @@ sub filter_with_english_letters ($$) {
7575
return $new;
7676
} # filter_with_english_letters
7777

78+
sub dump_status ($) {
79+
my $self = $_[0];
80+
printf "%s\n", ref $self;
81+
} # dump_status
82+
7883
package Web::Encoding::UnivCharDet::CharsetProber::Latin1;
7984
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
8085
our $VERSION = '1.0';
@@ -156,7 +161,8 @@ sub handle_data ($$) {
156161
my $new_buf1 = $self->filter_with_english_letters ($_[1]);
157162

158163
for my $i (0..((length $new_buf1) - 1)) {
159-
my $char_class = $Latin1_CharToClass->[ord substr $new_buf1, $i, 1];
164+
my $c = ord substr $new_buf1, $i, 1;
165+
my $char_class = $Latin1_CharToClass->[$c];
160166
my $freq = $Latin1ClassModel->[$self->{last_char_class}*CLASS_NUM + $char_class];
161167
if ($freq == 0) {
162168
$self->{state} = 'not me';
@@ -557,9 +563,11 @@ package Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup;
557563
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
558564
our $VERSION = '1.0';
559565

560-
sub new ($$) {
561-
my $self = bless {}, $_[0];
562-
my $filter = $_[1];
566+
sub new ($$;%) {
567+
my $self = bless {}, shift;
568+
my $filter = shift;
569+
my %args = @_;
570+
563571
$self->{probers} = [
564572
Web::Encoding::UnivCharDet::CharsetProber::UTF8->new,
565573
$filter & Web::Encoding::UnivCharDet::Defs::FILTER_JAPANESE
@@ -587,6 +595,7 @@ sub new ($$) {
587595
($filter == Web::Encoding::UnivCharDet::Defs::FILTER_CHINESE_TRADITIONAL)
588596
: undef,
589597
];
598+
590599
$self->reset;
591600
return $self;
592601
} # new

0 commit comments

Comments
 (0)