@@ -5,13 +5,14 @@ our $VERSION = '1.0';
55use Web::Encoding;
66
77# # context
8+ # # any - any content (not implemented by browsers)
89# # html - HTML (navigate)
910# # responsehtml - HTML (responseXML)
1011# # xml - XML (navigate, responseXML, responseText)
11- # # css - CSS
12+ # # css - CSS external style sheet
1213# # text - text (navigate)
13- # # responsetext - non-XML (responseText)
14- # # classicscript - <script src> with type "classic"
14+ # # responsetext - non-XML text (responseText)
15+ # # classicscript - JavaScript ( <script src> with type "classic")
1516sub new_from_context ($$) {
1617 return bless {
1718 context => $_ [1],
@@ -26,6 +27,10 @@ sub encoding ($) {
2627 return $_ [0]-> {encoding };
2728} # encoding
2829
30+ sub font_encoding ($) {
31+ return $_ [0]-> {font_encoding };
32+ } # font_encoding
33+
2934sub source ($) {
3035 return $_ [0]-> {source };
3136} # source
@@ -211,6 +216,7 @@ sub _prescan_byte_stream ($) {
211216# # locale - user's locale's language tag in lowercase or undef
212217sub detect ($$ ;%) {
213218 my ($self , undef , %args ) = @_ ;
219+ delete $self -> {font_encoding };
214220
215221 # # BOM
216222 if ($_ [1] =~ / ^\xFE\xFF / ) {
@@ -255,7 +261,8 @@ sub detect ($$;%) {
255261 # # Prescan xml
256262 if ($self -> {context } eq ' html' or
257263 $self -> {context } eq ' responsehtml' or
258- $self -> {context } eq ' xml' ) {
264+ $self -> {context } eq ' xml' or
265+ $self -> {context } eq ' any' ) {
259266 my $name = _prescan_xml $_ [1];
260267 if (defined $name ) {
261268 $self -> {encoding } = $name ;
@@ -271,7 +278,8 @@ sub detect ($$;%) {
271278
272279 # # Prescan html
273280 if ($self -> {context } eq ' html' or
274- $self -> {context } eq ' responsehtml' ) {
281+ $self -> {context } eq ' responsehtml' or
282+ $self -> {context } eq ' any' ) {
275283 my $name = _prescan_byte_stream $_ [1];
276284 if (defined $name ) {
277285 $self -> {encoding } = $name ;
@@ -285,7 +293,8 @@ sub detect ($$;%) {
285293 }
286294 }
287295
288- if ($self -> {context } eq ' css' ) {
296+ if ($self -> {context } eq ' css' or
297+ $self -> {context } eq ' any' ) {
289298 # # <https://drafts.csswg.org/css-syntax/#determine-the-fallback-encoding>
290299 if ($_ [1] =~ / \A\x40\x63\x68\x61\x72\x73\x65\x74\x20\x22 ([\x00 -\x21\x23 -\x7F ]*)\x22\x3B / ) {
291300 my $name = encoding_label_to_name $1 ;
@@ -318,17 +327,37 @@ sub detect ($$;%) {
318327 return ;
319328 }
320329
321- if ($self -> {context } eq ' html' or $self -> {context } eq ' text' ) {
322- # # UNIVCHARDET
323- require Web::Encoding::UnivCharDet;
324- my $det = Web::Encoding::UnivCharDet-> new;
325- # XXX locale-dependent configuration
326- my $name = encoding_label_to_name $det -> detect_byte_string ($_ [1]);
327- if ($name ) {
328- $self -> {encoding } = $name ;
329- delete $self -> {confident };
330- $self -> {source } = ' univchardet' ;
331- return ;
330+ if ($self -> {context } eq ' html' or
331+ $self -> {context } eq ' text' or
332+ $self -> {context } eq ' any' ) {
333+ # # Implementation-dependent detections
334+ {
335+ my $font_def ;
336+ if ($self -> {context } eq ' html' or $self -> {context } eq ' any' ) {
337+ $font_def = $self -> _detect_font ($_ [1]); # or undef
338+ }
339+
340+ # # UNIVCHARDET
341+ require Web::Encoding::UnivCharDet;
342+ my $det = Web::Encoding::UnivCharDet-> new;
343+ # XXX locale-dependent configuration
344+ my $got = $det -> detect_byte_string ($_ [1]);
345+ my $name = encoding_label_to_name $got ;
346+ if (defined $font_def ) {
347+ if (not defined $name or not $name eq ' utf-8' ) {
348+ $self -> {encoding } = ' windows-1252' ;
349+ delete $self -> {confident };
350+ $self -> {source } = ' font' ;
351+ $self -> {font_encoding } = $font_def -> {charset };
352+ return ;
353+ }
354+ }
355+ if (defined $name and not $got eq ' ascii' ) {
356+ $self -> {encoding } = $name ;
357+ delete $self -> {confident };
358+ $self -> {source } = ' univchardet' ;
359+ return ;
360+ }
332361 }
333362
334363 # # Locale
@@ -363,11 +392,176 @@ sub detect ($$;%) {
363392 return ;
364393} # detect
365394
395+ # XXX
396+ my $FontDefs = {
397+ " limon s1" => {charset => ' x-abc' },
398+ aniezhai => {charset => " x-aniezhai" },
399+ " adarshalipiexp" => {charset => " x-adarshalipiexp" },
400+ " adhawin-tamil" => {charset => " x-adhawin" },
401+ " adhawin-tamil regular" => {charset => " x-adhawin" },
402+ adhawintamil => {charset => " x-adhawin" },
403+ amudham2000 => {charset => " x-amudham2000" },
404+ " arial am" => {charset => " armscii-8" },
405+ " arial latarm" => {charset => " armscii-8" },
406+ au => {charset => " x-au" },
407+ bhaskar => {charset => " bhaskar" },
408+ chanakya => {charset => " x-chanakya" },
409+ eenadu => {charset => " x-eenadu" },
410+ epatrika => {charset => " x-epatrika" },
411+ rswwwnet => {charset => " georgian-academy" },
412+ trg1 => {charset => " georgian-academy" },
413+ " bpg classic dina" => {charset => " georgian-academy" },
414+ gopika => {charset => " x-gopika" },
415+ htchanakya => {charset => " htchanakya" },
416+ inaimathi => {charset => " x-inaimathi" },
417+ " inaimathi-1.8" => {charset => " x-inaimathi" },
418+ jagran => {charset => " jagran" },
419+ " ml-ttkarthika" => {charset => " x-karthika" },
420+ lokweb => {charset => " x-lokweb" },
421+ " lt-tm-barani" => {charset => " x-tam-lttmbarani" },
422+ " mac c swiss" => {charset => " x-mac-c-swiss" },
423+ " knw-ttnandi" => {charset => " x-nandi" },
424+ " utopic" => {charset => " x-utopic" },
425+ " unq_ttabid" => {charset => " x-pascii" },
426+ pothana => {charset => " x-pothana" },
427+ " sanskrit new" => {charset => " x-sanskrit-new" },
428+ " or-ttsarala" => {charset => " x-sarala" },
429+ " shree-mal-0502" => {charset => " x-shree-mal-0502" },
430+ " shree-tel-0900" => {charset => " x-shree-tel-0900" },
431+ shree802 => {charset => " x-shree802" },
432+ " subak-1" => {charset => " x-subak" },
433+ " dv-ttsurekh" => {charset => " x-surekh" },
434+ suritlr => {charset => " x-suritlr" },
435+ suritlk => {charset => " x-suritlk" },
436+ webtamil => {charset => " x-tam-webtamil" },
437+ " telugu lipi" => {charset => " x-telugu-lipi" },
438+ thoolika => {charset => " x-thoolika" },
439+ tikkana => {charset => " x-tikkana" },
440+ tboomis => {charset => " x-tam-tboomis" },
441+ tboomih => {charset => " x-tam-tboomis" },
442+ tboomi => {charset => " x-tam-tboomis" },
443+ tmnews => {charset => " x-tam-tmnews" },
444+ telugufont => {charset => " x-telugufont" },
445+ " tab-anna" => {charset => " tab" },
446+ " tab_inaimathi" => {charset => " tab" },
447+ " tab-lfs-kamban" => {charset => " tab" },
448+ ' tam-kalaignar' => {charset => " tam" },
449+ " tsc_janani" => {charset => " tscii" },
450+ " thunaivantsc" => {charset => " tscii" },
451+ " tscsaiindira" => {charset => " tscii" },
452+ " tscsaisai" => {charset => " tscii" },
453+ " tscarial" => {charset => " tscii" },
454+ " tsccomic" => {charset => " tscii" },
455+ " tscmylai" => {charset => " tscii" },
456+ " tsctimes" => {charset => " tscii" },
457+ " tscverdana" => {charset => " tscii" },
458+ " tsc_avarangal" => {charset => " tscii" },
459+ " tsc_avarangalfxd" => {charset => " tscii" },
460+ " tsc_kannadaasan" => {charset => " tscii" },
461+ " tsc_paranar" => {charset => " tscii" },
462+ " tsc_thunaivan" => {charset => " tscii" },
463+ " tsc-sri" => {charset => " tscii" },
464+ tscu_inaimathi => {charset => " tscii" },
465+ inaimathitsc => {charset => " tscii" },
466+ tneritsc => {charset => " tscii" },
467+ " perathanaitsc" => {charset => " tscii" },
468+ " aparanartsc" => {charset => " tscii" },
469+ " comictsc" => {charset => " tscii" },
470+ " maduramtsc" => {charset => " tscii" },
471+ " mylaifixtsc" => {charset => " tscii" },
472+ " mylaitsc" => {charset => " tscii" },
473+ " nanthinitsc" => {charset => " tscii" },
474+ " sri-tsc" => {charset => " tscii" },
475+ " timestsc" => {charset => " tscii" },
476+ " tneritsc" => {charset => " tscii" },
477+ " tamil_avarangal31tsc" => {charset => " tscii" },
478+ shivaji01 => {charset => " x-shivaji01" },
479+ vakil_01 => {charset => " x-vakil_01" },
480+ " .vntime" => {charset => " x-viet-tcvn" },
481+ " vntime" => {charset => " x-viet-tcvn" },
482+ " vni-aptima" => {charset => " x-viet-vni" },
483+ " vni-helve" => {charset => " x-viet-vni" },
484+ " vni-times" => {charset => " x-viet-vni" },
485+ " vni-internet mail" => {charset => " x-viet-vni" },
486+ " vni couri" => {charset => " x-viet-vni" },
487+ " vps times" => {charset => " x-viet-vps" },
488+ vikatan => {charset => " x-vikatan" },
489+ webdunia => {charset => " x-webdunia" },
490+ xdvng => {charset => ' x-xdvng' },
491+ };
492+
493+ sub _detect_font ($$) {
494+ my $self = shift ;
495+ $self -> {fonts } = {};
496+
497+ # 1.
498+ (pos $_ [0]) = 0;
499+
500+ my $count = 0;
501+ # 2.
502+ LOOP: {
503+ $_ [0] =~ / \G <!--+>/gc ;
504+ $_ [0] =~ / \G <!--.*?-->/gcs ;
505+ if ($_ [0] =~ / \G <[Ff][Oo][Nn][Tt](?=[\x09\x0A\x0C\x0D\x20\x2F ])/gc ) {
506+ # 1.
507+ #
508+
509+ # 2.-5.
510+ my $attr_list = {};
511+
512+ # 6.
513+ ATTRS: {
514+ my $attr = _get_attr ($_ [0]) or last ATTRS;
515+
516+ # 7.
517+ redo ATTRS if $attr_list -> {$attr -> {name }};
518+
519+ # 8.
520+ $attr_list -> {$attr -> {name }} = $attr ;
521+
522+ # 9.
523+ if ($attr -> {name } eq ' face' ) {
524+ my $attr_value = $attr -> {value };
525+ $attr_value =~ s /\A [\x09\x0A\x0C\x0D\x20 ]+// ;
526+ $attr_value =~ s / [\x09\x0A\x0C\x0D\x20 ]+\z // ;
527+ $attr_value =~ tr / A-Z/ a-z/ ;
528+ $self -> {fonts }-> {$_ }++ for split /[\x 09 \x0A\x0C\x0D\x 20 ]*,[\x 09 \x0A\x0C\x0D\x 20 ]*/, $attr_value ;
529+ last LOOP if $count ++ > 10;
530+ }
531+
532+ # 10.
533+ last LOOP if pos $_ [0] >= length $_ [0];
534+ redo ATTRS;
535+ } # ATTRS
536+ } elsif ($_ [0] =~ m {\G </?[A-Za-z][^\x09\x0A\x0C\x0D\x20 >]*} gc ) {
537+ {
538+ _get_attr ($_ [0]) and redo ;
539+ }
540+ } elsif ($_ [0] =~ m {\G <[!/?][^>]*} gc ) {
541+ #
542+ }
543+
544+ # 3. Next byte
545+ $_ [0] =~ / \G [^<]+/gc || $_ [0] =~ / \G </gc ;
546+ last LOOP if pos $_ [0] >= length $_ [0];
547+ redo LOOP;
548+ } # LOOP
549+
550+ for my $font_name (sort { $self -> {fonts }-> {$b } <=> $self -> {fonts }-> {$a } } keys %{$self -> {fonts }}) {
551+ my $f = $FontDefs -> {$font_name };
552+ if (defined $f ) {
553+ return $f ;
554+ }
555+ }
556+
557+ return undef ;
558+ } # _detect_font
559+
3665601;
367561
368562=head1 LICENSE
369563
370- Copyright 2007-2017 Wakaba <wakaba@suikawiki.org>.
564+ Copyright 2007-2025 Wakaba <wakaba@suikawiki.org>.
371565
372566This library is free software; you can redistribute it and/or modify
373567it under the same terms as Perl itself.
0 commit comments