Skip to content

Commit 574e9ba

Browse files
committed
font-specific encodings should be detected as windows-1252 by sniffer
1 parent 228d763 commit 574e9ba

File tree

4 files changed

+225
-30
lines changed

4 files changed

+225
-30
lines changed

lib/Web/Encoding/Sniffer.pm

Lines changed: 212 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -5,13 +5,14 @@ our $VERSION = '1.0';
55
use Web::Encoding;
66

77
## context
8+
## any - any content (not implemented by browsers)
89
## html - HTML (navigate)
910
## responsehtml - HTML (responseXML)
1011
## xml - XML (navigate, responseXML, responseText)
11-
## css - CSS
12+
## css - CSS external style sheet
1213
## text - text (navigate)
13-
## responsetext - non-XML (responseText)
14-
## classicscript - <script src> with type "classic"
14+
## responsetext - non-XML text (responseText)
15+
## classicscript - JavaScript (<script src> with type "classic")
1516
sub new_from_context ($$) {
1617
return bless {
1718
context => $_[1],
@@ -26,6 +27,10 @@ sub encoding ($) {
2627
return $_[0]->{encoding};
2728
} # encoding
2829

30+
sub font_encoding ($) {
31+
return $_[0]->{font_encoding};
32+
} # font_encoding
33+
2934
sub source ($) {
3035
return $_[0]->{source};
3136
} # source
@@ -211,6 +216,7 @@ sub _prescan_byte_stream ($) {
211216
## locale - user's locale's language tag in lowercase or undef
212217
sub detect ($$;%) {
213218
my ($self, undef, %args) = @_;
219+
delete $self->{font_encoding};
214220

215221
## BOM
216222
if ($_[1] =~ /^\xFE\xFF/) {
@@ -255,7 +261,8 @@ sub detect ($$;%) {
255261
## Prescan xml
256262
if ($self->{context} eq 'html' or
257263
$self->{context} eq 'responsehtml' or
258-
$self->{context} eq 'xml') {
264+
$self->{context} eq 'xml' or
265+
$self->{context} eq 'any') {
259266
my $name = _prescan_xml $_[1];
260267
if (defined $name) {
261268
$self->{encoding} = $name;
@@ -271,7 +278,8 @@ sub detect ($$;%) {
271278

272279
## Prescan html
273280
if ($self->{context} eq 'html' or
274-
$self->{context} eq 'responsehtml') {
281+
$self->{context} eq 'responsehtml' or
282+
$self->{context} eq 'any') {
275283
my $name = _prescan_byte_stream $_[1];
276284
if (defined $name) {
277285
$self->{encoding} = $name;
@@ -285,7 +293,8 @@ sub detect ($$;%) {
285293
}
286294
}
287295

288-
if ($self->{context} eq 'css') {
296+
if ($self->{context} eq 'css' or
297+
$self->{context} eq 'any') {
289298
## <https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding>
290299
if ($_[1] =~ /\A\x40\x63\x68\x61\x72\x73\x65\x74\x20\x22([\x00-\x21\x23-\x7F]*)\x22\x3B/) {
291300
my $name = encoding_label_to_name $1;
@@ -318,17 +327,37 @@ sub detect ($$;%) {
318327
return;
319328
}
320329

321-
if ($self->{context} eq 'html' or $self->{context} eq 'text') {
322-
## UNIVCHARDET
323-
require Web::Encoding::UnivCharDet;
324-
my $det = Web::Encoding::UnivCharDet->new;
325-
# XXX locale-dependent configuration
326-
my $name = encoding_label_to_name $det->detect_byte_string ($_[1]);
327-
if ($name) {
328-
$self->{encoding} = $name;
329-
delete $self->{confident};
330-
$self->{source} = 'univchardet';
331-
return;
330+
if ($self->{context} eq 'html' or
331+
$self->{context} eq 'text' or
332+
$self->{context} eq 'any') {
333+
## Implementation-dependent detections
334+
{
335+
my $font_def;
336+
if ($self->{context} eq 'html' or $self->{context} eq 'any') {
337+
$font_def = $self->_detect_font ($_[1]); # or undef
338+
}
339+
340+
## UNIVCHARDET
341+
require Web::Encoding::UnivCharDet;
342+
my $det = Web::Encoding::UnivCharDet->new;
343+
# XXX locale-dependent configuration
344+
my $got = $det->detect_byte_string ($_[1]);
345+
my $name = encoding_label_to_name $got;
346+
if (defined $font_def) {
347+
if (not defined $name or not $name eq 'utf-8') {
348+
$self->{encoding} = 'windows-1252';
349+
delete $self->{confident};
350+
$self->{source} = 'font';
351+
$self->{font_encoding} = $font_def->{charset};
352+
return;
353+
}
354+
}
355+
if (defined $name and not $got eq 'ascii') {
356+
$self->{encoding} = $name;
357+
delete $self->{confident};
358+
$self->{source} = 'univchardet';
359+
return;
360+
}
332361
}
333362

334363
## Locale
@@ -363,11 +392,176 @@ sub detect ($$;%) {
363392
return;
364393
} # detect
365394

395+
# XXX
396+
my $FontDefs = {
397+
"limon s1" => {charset => 'x-abc'},
398+
aniezhai => {charset => "x-aniezhai"},
399+
"adarshalipiexp" => {charset => "x-adarshalipiexp"},
400+
"adhawin-tamil" => {charset => "x-adhawin"},
401+
"adhawin-tamil regular" => {charset => "x-adhawin"},
402+
adhawintamil => {charset => "x-adhawin"},
403+
amudham2000 => {charset => "x-amudham2000"},
404+
"arial am" => {charset => "armscii-8"},
405+
"arial latarm" => {charset => "armscii-8"},
406+
au => {charset => "x-au"},
407+
bhaskar => {charset => "bhaskar"},
408+
chanakya => {charset => "x-chanakya"},
409+
eenadu => {charset => "x-eenadu"},
410+
epatrika => {charset => "x-epatrika"},
411+
rswwwnet => {charset => "georgian-academy"},
412+
trg1 => {charset => "georgian-academy"},
413+
"bpg classic dina" => {charset => "georgian-academy"},
414+
gopika => {charset => "x-gopika"},
415+
htchanakya => {charset => "htchanakya"},
416+
inaimathi => {charset => "x-inaimathi"},
417+
"inaimathi-1.8" => {charset => "x-inaimathi"},
418+
jagran => {charset => "jagran"},
419+
"ml-ttkarthika" => {charset => "x-karthika"},
420+
lokweb => {charset => "x-lokweb"},
421+
"lt-tm-barani" => {charset => "x-tam-lttmbarani"},
422+
"mac c swiss" => {charset => "x-mac-c-swiss"},
423+
"knw-ttnandi" => {charset => "x-nandi"},
424+
"utopic" => {charset => "x-utopic"},
425+
"unq_ttabid" => {charset => "x-pascii"},
426+
pothana => {charset => "x-pothana"},
427+
"sanskrit new" => {charset => "x-sanskrit-new"},
428+
"or-ttsarala" => {charset => "x-sarala"},
429+
"shree-mal-0502" => {charset => "x-shree-mal-0502"},
430+
"shree-tel-0900" => {charset => "x-shree-tel-0900"},
431+
shree802 => {charset => "x-shree802"},
432+
"subak-1" => {charset => "x-subak"},
433+
"dv-ttsurekh" => {charset => "x-surekh"},
434+
suritlr => {charset => "x-suritlr"},
435+
suritlk => {charset => "x-suritlk"},
436+
webtamil => {charset => "x-tam-webtamil"},
437+
"telugu lipi" => {charset => "x-telugu-lipi"},
438+
thoolika => {charset => "x-thoolika"},
439+
tikkana => {charset => "x-tikkana"},
440+
tboomis => {charset => "x-tam-tboomis"},
441+
tboomih => {charset => "x-tam-tboomis"},
442+
tboomi => {charset => "x-tam-tboomis"},
443+
tmnews => {charset => "x-tam-tmnews"},
444+
telugufont => {charset => "x-telugufont"},
445+
"tab-anna" => {charset => "tab"},
446+
"tab_inaimathi" => {charset => "tab"},
447+
"tab-lfs-kamban" => {charset => "tab"},
448+
'tam-kalaignar' => {charset => "tam"},
449+
"tsc_janani" => {charset => "tscii"},
450+
"thunaivantsc" => {charset => "tscii"},
451+
"tscsaiindira" => {charset => "tscii"},
452+
"tscsaisai" => {charset => "tscii"},
453+
"tscarial" => {charset => "tscii"},
454+
"tsccomic" => {charset => "tscii"},
455+
"tscmylai" => {charset => "tscii"},
456+
"tsctimes" => {charset => "tscii"},
457+
"tscverdana" => {charset => "tscii"},
458+
"tsc_avarangal" => {charset => "tscii"},
459+
"tsc_avarangalfxd" => {charset => "tscii"},
460+
"tsc_kannadaasan" => {charset => "tscii"},
461+
"tsc_paranar" => {charset => "tscii"},
462+
"tsc_thunaivan" => {charset => "tscii"},
463+
"tsc-sri" => {charset => "tscii"},
464+
tscu_inaimathi => {charset => "tscii"},
465+
inaimathitsc => {charset => "tscii"},
466+
tneritsc => {charset => "tscii"},
467+
"perathanaitsc" => {charset => "tscii"},
468+
"aparanartsc" => {charset => "tscii"},
469+
"comictsc" => {charset => "tscii"},
470+
"maduramtsc" => {charset => "tscii"},
471+
"mylaifixtsc" => {charset => "tscii"},
472+
"mylaitsc" => {charset => "tscii"},
473+
"nanthinitsc" => {charset => "tscii"},
474+
"sri-tsc" => {charset => "tscii"},
475+
"timestsc" => {charset => "tscii"},
476+
"tneritsc" => {charset => "tscii"},
477+
"tamil_avarangal31tsc" => {charset => "tscii"},
478+
shivaji01 => {charset => "x-shivaji01"},
479+
vakil_01 => {charset => "x-vakil_01"},
480+
".vntime" => {charset => "x-viet-tcvn"},
481+
"vntime" => {charset => "x-viet-tcvn"},
482+
"vni-aptima" => {charset => "x-viet-vni"},
483+
"vni-helve" => {charset => "x-viet-vni"},
484+
"vni-times" => {charset => "x-viet-vni"},
485+
"vni-internet mail" => {charset => "x-viet-vni"},
486+
"vni couri" => {charset => "x-viet-vni"},
487+
"vps times" => {charset => "x-viet-vps"},
488+
vikatan => {charset => "x-vikatan"},
489+
webdunia => {charset => "x-webdunia"},
490+
xdvng => {charset => 'x-xdvng'},
491+
};
492+
493+
sub _detect_font ($$) {
494+
my $self = shift;
495+
$self->{fonts} = {};
496+
497+
# 1.
498+
(pos $_[0]) = 0;
499+
500+
my $count = 0;
501+
# 2.
502+
LOOP: {
503+
$_[0] =~ /\G<!--+>/gc;
504+
$_[0] =~ /\G<!--.*?-->/gcs;
505+
if ($_[0] =~ /\G<[Ff][Oo][Nn][Tt](?=[\x09\x0A\x0C\x0D\x20\x2F])/gc) {
506+
# 1.
507+
#
508+
509+
# 2.-5.
510+
my $attr_list = {};
511+
512+
# 6.
513+
ATTRS: {
514+
my $attr = _get_attr ($_[0]) or last ATTRS;
515+
516+
# 7.
517+
redo ATTRS if $attr_list->{$attr->{name}};
518+
519+
# 8.
520+
$attr_list->{$attr->{name}} = $attr;
521+
522+
# 9.
523+
if ($attr->{name} eq 'face') {
524+
my $attr_value = $attr->{value};
525+
$attr_value =~ s/\A[\x09\x0A\x0C\x0D\x20]+//;
526+
$attr_value =~ s/[\x09\x0A\x0C\x0D\x20]+\z//;
527+
$attr_value =~ tr/A-Z/a-z/;
528+
$self->{fonts}->{$_}++ for split /[\x09\x0A\x0C\x0D\x20]*,[\x09\x0A\x0C\x0D\x20]*/, $attr_value;
529+
last LOOP if $count++ > 10;
530+
}
531+
532+
# 10.
533+
last LOOP if pos $_[0] >= length $_[0];
534+
redo ATTRS;
535+
} # ATTRS
536+
} elsif ($_[0] =~ m{\G</?[A-Za-z][^\x09\x0A\x0C\x0D\x20>]*}gc) {
537+
{
538+
_get_attr ($_[0]) and redo;
539+
}
540+
} elsif ($_[0] =~ m{\G<[!/?][^>]*}gc) {
541+
#
542+
}
543+
544+
# 3. Next byte
545+
$_[0] =~ /\G[^<]+/gc || $_[0] =~ /\G</gc;
546+
last LOOP if pos $_[0] >= length $_[0];
547+
redo LOOP;
548+
} # LOOP
549+
550+
for my $font_name (sort { $self->{fonts}->{$b} <=> $self->{fonts}->{$a} } keys %{$self->{fonts}}) {
551+
my $f = $FontDefs->{$font_name};
552+
if (defined $f) {
553+
return $f;
554+
}
555+
}
556+
557+
return undef;
558+
} # _detect_font
559+
366560
1;
367561

368562
=head1 LICENSE
369563
370-
Copyright 2007-2017 Wakaba <wakaba@suikawiki.org>.
564+
Copyright 2007-2025 Wakaba <wakaba@suikawiki.org>.
371565
372566
This library is free software; you can redistribute it and/or modify
373567
it under the same terms as Perl itself.

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 10 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,6 @@ sub reset ($) {
6464
my $self = $_[0];
6565
$self->{done} = 0;
6666
$self->{best_guess} = -1;
67-
$self->{in_tag} = 0;
6867
$self->{start} = 1;
6968
$self->{detected_charset} = undef;
7069
$self->{got_data} = undef;
@@ -74,13 +73,14 @@ sub reset ($) {
7473
delete $self->{esc_charset_prober};
7574
delete $self->{utf1632_prober};
7675
delete $self->{reported};
77-
#delete $self->{nbsp_found};
76+
delete $self->{nbsp_found};
7877
delete $self->{esc_found};
7978
delete $self->{binary_found};
8079
$self->{win1250_refs} = 0;
8180
$self->{win1252_refs} = 0;
8281
$self->{unicode_refs} = 0;
8382
delete $self->{resolve_latin1_refs};
83+
delete $self->{amp};
8484
} # reset
8585

8686
sub handle_data ($$) {
@@ -124,10 +124,9 @@ sub handle_data ($$) {
124124
for my $i (0..($length - 1)) {
125125
my $c = ord substr $_[1], $i, 1;
126126
$zero++ if $c == 0x00;
127-
#if ($c == 0xA0) {
128-
# $self->{nbsp_found} = 1;
129-
#} elsif ($c & 0x80) {
130-
if ($c & 0x80 and $c != 0xA0) {
127+
if ($c == 0xA0) {
128+
$self->{nbsp_found} = 1;
129+
} elsif ($c & 0x80) {
131130
if ($self->{input_state} ne 'high byte') {
132131
$self->{input_state} = 'high byte';
133132
$high = 1;
@@ -151,6 +150,7 @@ sub handle_data ($$) {
151150
}
152151
$self->{last_char} = $c;
153152
}
153+
154154
if (defined $self->{amp}) {
155155
if ($c == 0x3B) {
156156
if (defined $Web::Encoding::UnivCharDet::Defs::Latin1Entities->{$self->{amp}}) {
@@ -188,7 +188,7 @@ sub handle_data ($$) {
188188
} else {
189189
delete $self->{amp};
190190
}
191-
}
191+
} # amp
192192
}
193193
} # $i
194194

@@ -346,10 +346,10 @@ sub data_end ($) {
346346
#
347347
} elsif ($self->{binary_found}) {
348348
#
349-
#} elsif ($self->{nbsp_found}) {
350-
# $self->{reported} = 'windows-1252';
349+
} elsif ($self->{nbsp_found}) {
350+
$self->{reported} = 'windows-1252';
351351
} else {
352-
$self->{reported} = 'windows-1252'; # ascii
352+
$self->{reported} = 'ascii';
353353
}
354354
}
355355
} # data_end

t/Web-Encoding-Sniffer.t

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,10 @@ for my $test_file_path ($tests_path->children (qr/\.dat$/)) {
3737
is $sniffer->encoding, $test->{encoding}->[1]->[0];
3838
is $sniffer->confident ? 'certain' : 'tentative', $test->{confidence}->[1]->[0];
3939
is $sniffer->source, $test->{source}->[1]->[0];
40+
is $sniffer->font_encoding, $test->{font}->[1]->[0];
4041

4142
done $c;
42-
} n => 3, name => [$file_name, $test->{name}->[0] || $test->{data}->[0]];
43+
} n => 4, name => [$file_name, $test->{name}->[0] || $test->{data}->[0]];
4344
};
4445
} # $test_file_path
4546

0 commit comments

Comments
 (0)