Skip to content

Commit aa8fa6b

Browse files
committed
Added MacRoman detection support; Fixed utf flagging API for consistency
1 parent 48b2080 commit aa8fa6b

File tree

6 files changed

+228
-22
lines changed

6 files changed

+228
-22
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,10 @@ use warnings;
44
our $VERSION = '1.0';
55
use Web::Encoding::UnivCharDet::Defs;
66

7-
sub new ($;%) {
7+
sub new ($) {
88
my $self = bless {}, shift;
9-
my %args = @_;
109

1110
$self->{filter} = {ja => 1, zh_hant => 1, zh_hans => 1, ko => 1, non_cjk => 1};
12-
$self->{filter}->{utf} = 1 if $args{utf};
1311

1412
return $self;
1513
} # new
@@ -49,6 +47,7 @@ package Web::Encoding::UnivCharDet::UniversalDetector;
4947
our $VERSION = '1.0';
5048
use Web::Encoding::UnivCharDet::CharsetProber;
5149
use Web::Encoding::UnivCharDet::UTFCharsetProber;
50+
use Web::Encoding::UnivCharDet::MacCharsetProber;
5251

5352
sub new ($$) {
5453
my $self = bless {
@@ -71,6 +70,7 @@ sub reset ($) {
7170
$self->{charset_probers} = [];
7271
delete $self->{esc_charset_prober};
7372
delete $self->{utf1632_prober};
73+
delete $self->{reported};
7474
} # reset
7575

7676
sub handle_data ($$) {
@@ -127,6 +127,8 @@ sub handle_data ($$) {
127127
if $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
128128
$self->{charset_probers}->[2]
129129
||= Web::Encoding::UnivCharDet::CharsetProber::Latin1->new;
130+
$self->{charset_probers}->[3]
131+
||= Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman->new;
130132
}
131133
} else {
132134
if ($self->{input_state} eq 'pure ascii' and

lib/Web/Encoding/UnivCharDet.pod

Lines changed: 19 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -23,14 +23,12 @@ Following methods are available:
2323

2424
=item $det = Web::Encoding::UnivCharDet->new
2525

26-
=item $det = Web::Encoding::UnivCharDet->new (NAME => VALUE, ...)
27-
2826
Return a new instance of the universal detector.
2927

30-
Zero or more named parameters can be specified.
28+
=item $det->filter->{utf} = Boolean (default: false)
3129

32-
If C<utf> parameter is set to a true value, the UTF-32 and UTF-16
33-
detection is enabled. Otherwise, it is disabled.
30+
If true, UTF-32 and UTF-16 detection is enabled. Otherwise, it is
31+
disabled.
3432

3533
=item $charset = $det->detect_byte_string ($bytes)
3634

@@ -44,7 +42,7 @@ are returned:
4442
utf-8 utf-16be utf-16le iso-2022-cn big5 x-euc-tw gb18030 hz-gb-2312
4543
iso-2022-jp shift_jis euc-jp iso-2022-kr euc-kr iso-8859-5 koi8-r
4644
windows-1251 x-mac-cyrillic ibm866 ibm855 iso-8859-7 tis-620
47-
windows-1253 iso-8859-8 windows-1255 windows-1252
45+
windows-1253 iso-8859-8 windows-1255 windows-1252 macintosh
4846

4947
Note that C<x-euc-tw> and C<ibm855> are not supported by the Encoding
5048
Standard. Encoding labels C<iso-2022-cn>, C<hz-gb-2312>, and
@@ -90,26 +88,31 @@ defined in the Encoding Standard at the time of writing.)
9088

9189
=head1 SEE ALSO
9290

93-
[UNIVCHARDET] A composite approach to language/encoding detection,
94-
S. Li, K. Momoi. Netscape. In Proceedings of the 19th International
95-
Unicode Conference.
91+
[UNIVCHARDET] "A composite approach to language/encoding detection",
92+
S. Li, K. Momoi, Proceedings of the 19th International Unicode
93+
Conference
9694
<https://www-archive.mozilla.org/projects/intl/universalcharsetdetection>.
9795

9896
[MOZILLAUNIVDET] universalchardet in Mozilla repository. This
99-
implementation derives from Mozilla's C++ implementation as of May 1,
97+
implementation derives from Mozilla's C++ implementation as of 1 May
10098
2013
10199
<https://hg.mozilla.org/mozilla-central/archive/c0e81c0222fc.zip>.
102100

103-
[MOZILLAUNIVDETBOM]
101+
[MOZILLAUNIVDETBOM] "Make Universal Charset Autodetector recognise UTF
102+
by BOM", Alexey Chernyak, 3 Nov 2005
104103
<https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>.
105104

106-
[CHARDET] Chardet: The Universal Character Encoding Detector,
105+
[CHARDET] "Chardet: The Universal Character Encoding Detector"
107106
<https://github.com/chardet/chardet>.
108107

109-
[CHARADE13] Support CP949, fixes #10
108+
[CHARDETMAC] "Added a prober for MacRoman encoding.", Rob Speer, 29
109+
Jun 2022
110+
<https://github.com/chardet/chardet/commit/c292b52a97e57c95429ef559af36845019b88b33>.
111+
112+
[CHARADE13] "Support CP949, fixes #10", puzzlet, 25 Jan 2013
110113
<https://github.com/sv24-archive/charade/pull/13/files>.
111114

112-
[SWUNIVERSALCHARDET] SuikaWiki:UniversalCharDet
115+
[SWUNIVERSALCHARDET] "UniversalCharDet", SuikaWiki authors
113116
<https://wiki.suikawiki.org/n/UniversalCharDet>.
114117

115118
=head1 AUTHOR
@@ -120,8 +123,8 @@ Wakaba <wakaba@suikawiki.org>.
120123

121124
Thanks to the authors and contributors of Mozilla's original universal
122125
detector implementation [MOZILLAUNIVDET] and its variants
123-
[MOZILLAUNIVDETBOM] [CHARADE13] [CHARDET], from which the Perl port
124-
derives.
126+
[MOZILLAUNIVDETBOM] [CHARADE13] [CHARDET] [CHARDETMAC], from which the
127+
Perl port derives.
125128

126129
=head1 LICENSE
127130

lib/Web/Encoding/UnivCharDet/Defs2.pm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,6 +79,7 @@ This module derived from
7979
# the Initial Developer. All Rights Reserved.
8080
#
8181
# Contributor(s):
82+
# Wakaba <wakaba@suikawiki.org>
8283
# Mark Pilgrim - port to Python
8384
#
8485
# This library is free software; you can redistribute it and/or
Lines changed: 195 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,195 @@
1+
package Web::Encoding::UnivCharDet::MacCharsetProber;
2+
use strict;
3+
use warnings;
4+
our $VERSION = '1.0';
5+
use Web::Encoding::UnivCharDet::CharsetProber;
6+
7+
package Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman;
8+
push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber);
9+
our $VERSION = '1.0';
10+
11+
sub FREQ_CAT_NUM () { 4 }
12+
sub UDF () { 0 }
13+
sub OTH () { 1 }
14+
sub ASC () { 2 }
15+
sub ASS () { 3 }
16+
sub ACV () { 4 }
17+
sub ACO () { 5 }
18+
sub ASV () { 6 }
19+
sub ASO () { 7 }
20+
sub ODD () { 8 }
21+
sub CLASS_NUM () { 9 }
22+
23+
# The change from Latin1 is that we explicitly look for extended characters
24+
# that are infrequently-occurring symbols, and consider them to always be
25+
# improbable. This should let MacRoman get out of the way of more likely
26+
# encodings in most situations.
27+
28+
my $MacRoman_CharToClass = [
29+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07
30+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F
31+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17
32+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F
33+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27
34+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F
35+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37
36+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F
37+
OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47
38+
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F
39+
ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57
40+
ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F
41+
OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67
42+
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F
43+
ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77
44+
ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F
45+
ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87
46+
ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F
47+
ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97
48+
ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F
49+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7
50+
OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF
51+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7
52+
OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF
53+
OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7
54+
OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF
55+
OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7
56+
ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF
57+
OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7
58+
ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF
59+
ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7
60+
ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF
61+
];
62+
63+
# 0 : illegal
64+
# 1 : very unlikely
65+
# 2 : normal
66+
# 3 : very likely
67+
my $MacRomanClassModel = [
68+
# UDF OTH ASC ASS ACV ACO ASV ASO ODD
69+
0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF
70+
0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH
71+
0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC
72+
0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS
73+
0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV
74+
0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO
75+
0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV
76+
0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO
77+
0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD
78+
];
79+
80+
sub new ($) {
81+
my $self = bless {}, $_[0];
82+
$self->reset;
83+
return $self;
84+
} # new
85+
86+
sub reset ($) {
87+
my $self = $_[0];
88+
$self->{state} = 'detecting';
89+
$self->{last_char_class} = OTH;
90+
$self->{freq_counter}->[$_] = 0 for 0..(FREQ_CAT_NUM - 1);
91+
92+
# express the prior that MacRoman is a somewhat rare encoding; this
93+
# can be done by starting out in a slightly improbable state that
94+
# must be overcome
95+
$self->{freq_counter}->[2] = 10;
96+
} # reset
97+
98+
sub get_charset_name ($) { 'macintosh' }
99+
100+
sub handle_data ($$) {
101+
my $self = $_[0];
102+
my $new_buf1 = $self->filter_with_english_letters ($_[1]);
103+
104+
for my $i (0..((length $new_buf1) - 1)) {
105+
my $c = ord substr $new_buf1, $i, 1;
106+
my $char_class = $MacRoman_CharToClass->[$c];
107+
my $freq = $MacRomanClassModel->[$self->{last_char_class}*CLASS_NUM + $char_class];
108+
if ($freq == 0) {
109+
$self->{state} = 'not me';
110+
last;
111+
}
112+
$self->{freq_counter}->[$freq]++;
113+
$self->{last_char_class} = $char_class;
114+
} # $i
115+
116+
return $self->{state};
117+
} # handle_data
118+
119+
sub get_confidence ($) {
120+
my $self = $_[0];
121+
if ($self->{state} eq 'not me') {
122+
return 0.01;
123+
}
124+
125+
my $total = 0;
126+
for my $i (0..(FREQ_CAT_NUM - 1)) {
127+
$total += $self->{freq_counter}->[$i];
128+
}
129+
130+
my $confidence;
131+
if ($total < 0.01) {
132+
$confidence = 0.0;
133+
} else {
134+
$confidence = ($self->{freq_counter}->[3] - $self->{freq_counter}->[1] * 20.0) / $total;
135+
}
136+
$confidence = 0.0 if $confidence < 0.0;
137+
138+
## lower the confidence of MacRoman so that other more accurate
139+
## detector can take priority.
140+
$confidence *= 0.73;
141+
142+
return $confidence;
143+
} # get_confidence
144+
145+
sub dump_status ($) {
146+
my $self = $_[0];
147+
printf " MacRomanProber: %1.3f [%s]\n",
148+
$self->get_confidence, $self->get_charset_name;
149+
} # dump_status
150+
151+
1;
152+
153+
=head1 AUTHOR
154+
155+
Wakaba <wakaba@suikawiki.org>.
156+
157+
=head1 ACKNOWLEDGEMENTS
158+
159+
This module is derived from
160+
<https://github.com/chardet/chardet/commit/c292b52a97e57c95429ef559af36845019b88b33>.
161+
162+
=head1 LICENSE
163+
164+
######################## BEGIN LICENSE BLOCK ########################
165+
# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>.
166+
# The Original Code is Mozilla Universal charset detector code.
167+
#
168+
# The Initial Developer of the Original Code is
169+
# Netscape Communications Corporation.
170+
# Portions created by the Initial Developer are Copyright (C) 2001
171+
# the Initial Developer. All Rights Reserved.
172+
#
173+
# Contributor(s):
174+
# Wakaba <wakaba@suikawiki.org>
175+
# Rob Speer - adapt to MacRoman encoding
176+
# Mark Pilgrim - port to Python
177+
# Shy Shalom - original C code
178+
#
179+
# This library is free software; you can redistribute it and/or
180+
# modify it under the terms of the GNU Lesser General Public
181+
# License as published by the Free Software Foundation; either
182+
# version 2.1 of the License, or (at your option) any later version.
183+
#
184+
# This library is distributed in the hope that it will be useful,
185+
# but WITHOUT ANY WARRANTY; without even the implied warranty of
186+
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
187+
# Lesser General Public License for more details.
188+
#
189+
# You should have received a copy of the GNU Lesser General Public
190+
# License along with this library; if not, write to the Free Software
191+
# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA
192+
# 02110-1301 USA
193+
######################### END LICENSE BLOCK #########################
194+
195+
=cut

lib/Web/Encoding/UnivCharDet/UTFCharsetProber.pm

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -265,6 +265,7 @@ as of 19 October Reiwa 7 (2025), i.e.
265265
######################## BEGIN LICENSE BLOCK ########################
266266
#
267267
# Contributor(s):
268+
# Wakaba <wakaba@suikawiki.org>
268269
# Jason Zavaglia
269270
#
270271
# This library is free software; you can redistribute it and/or

t/Web-Encoding-UnivCharDet.t

Lines changed: 7 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -51,19 +51,23 @@ test {
5151
my $det = Web::Encoding::UnivCharDet->new;
5252
is $det->detect_byte_string ("\xFF\xFE\x00\x00"), 'utf-16le';
5353
is $det->detect_byte_string ("\x00\x00\xFF\xFE"), 'windows-1252';
54+
is $det->detect_byte_string ("abc\x8Fxyz\x8D"), 'macintosh';
5455

5556
done $c;
56-
} name => 'no utf flag', n => 2;
57+
} name => 'no utf flag', n => 3;
5758

5859
test {
5960
my $c = shift;
6061

61-
my $det = Web::Encoding::UnivCharDet->new (utf => 1);
62+
my $det = Web::Encoding::UnivCharDet->new;
63+
$det->filter->{utf} = 1;
64+
6265
is $det->detect_byte_string ("\xFF\xFE\x00\x00"), 'utf-32le';
6366
is $det->detect_byte_string ("\x00\x00\xFF\xFE"), 'x-iso-10646-ucs-4-2143';
67+
is $det->detect_byte_string ("abc\x8Fxyz\x8D"), 'macintosh';
6468

6569
done $c;
66-
} name => 'with utf flag', n => 2;
70+
} name => 'with utf flag', n => 3;
6771

6872
run_tests;
6973

0 commit comments

Comments
 (0)