@@ -9,7 +9,7 @@ sub new ($;%) {
99 my %args = @_ ;
1010
1111 $self -> {filter } = {ja => 1, zh_hant => 1, zh_hans => 1, ko => 1, non_cjk => 1};
12- $self -> {filter }-> {utf32 } = 1 if $args {utf32 };
12+ $self -> {filter }-> {utf } = 1 if $args {utf };
1313
1414 return $self ;
1515} # new
@@ -23,7 +23,7 @@ sub _detector ($) {
2323 $filter |= Web::Encoding::UnivCharDet::Defs::FILTER_KOREAN () if $_ [0]-> {filter }-> {ko };
2424 $filter |= Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK () if $_ [0]-> {filter }-> {non_cjk };
2525 my $x = Web::Encoding::UnivCharDet::UniversalDetector-> new ($filter );
26- $x -> {utf32 } = 1 if $_ [0]-> {filter }-> {utf32 };
26+ $x -> {utf } = 1 if $_ [0]-> {filter }-> {utf };
2727 $x ;
2828 };
2929} # _detector
@@ -48,11 +48,11 @@ sub _dump ($) {
4848package Web::Encoding::UnivCharDet::UniversalDetector ;
4949our $VERSION = ' 1.0' ;
5050use Web::Encoding::UnivCharDet::CharsetProber;
51+ use Web::Encoding::UnivCharDet::UTFCharsetProber;
5152
5253sub new ($$) {
5354 my $self = bless {
5455 lang_filter => $_ [1],
55- charset_probers => [],
5656 }, $_ [0];
5757 $self -> reset ;
5858 return $self ;
@@ -68,8 +68,9 @@ sub reset ($) {
6868 $self -> {got_data } = undef ;
6969 $self -> {input_state } = ' pure ascii' ;
7070 $self -> {last_char } = 0x00;
71- $self -> {esc_charset_prober }-> reset if $self -> {esc_charset_prober };
72- $_ -> reset for grep { $_ } @{$self -> {charset_probers }};
71+ $self -> {charset_probers } = [];
72+ delete $self -> {esc_charset_prober };
73+ delete $self -> {utf1632_prober };
7374} # reset
7475
7576sub handle_data ($$) {
@@ -88,7 +89,7 @@ sub handle_data ($$) {
8889 $self -> {detected_charset } = ' utf-16le' ;
8990 }
9091
91- if ($self -> {utf32 }) {
92+ if ($self -> {utf }) {
9293 # # <https://github.com/mozilla/gecko-dev/commit/68332f717f14e8f2467ca4f2c521ed8fe6eff71d>
9394 if ($_ [1] =~ / ^\xFE\xFF\x00\x00 / ) {
9495 $self -> {detected_charset } = ' x-iso-10646-ucs-4-3412' ;
@@ -107,16 +108,20 @@ sub handle_data ($$) {
107108 }
108109 } # start
109110
110- for my $i (0..((length $_ [1]) - 1)) {
111+ my $length = length $_ [1];
112+ my $zero = 0;
113+ for my $i (0..($length - 1)) {
111114 my $c = ord substr $_ [1], $i , 1;
115+ $zero ++ if $c == 0x00;
112116 if ($c & 0x80 and $c != 0xA0) {
113117 if ($self -> {input_state } ne ' high byte' ) {
114118 $self -> {input_state } = ' high byte' ;
115- delete $self -> {esc_charset_prober } if $self -> {esc_charset_prober };
119+ delete $self -> {esc_charset_prober };
120+ delete $self -> {utf1632_prober };
116121
117122 $self -> {charset_probers }-> [0]
118123 ||= Web::Encoding::UnivCharDet::CharsetProber::MBCSGroup-> new
119- ($self -> {lang_filter });
124+ ($self -> {lang_filter });
120125 $self -> {charset_probers }-> [1]
121126 ||= Web::Encoding::UnivCharDet::CharsetProber::SBCSGroup-> new
122127 if $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
@@ -133,51 +138,75 @@ sub handle_data ($$) {
133138 }
134139 } # $i
135140
141+ if ($self -> {utf } and $zero ) {
142+ if ($zero / ($length || 1) > 0.1) { # random threshold
143+ $self -> {charset_probers } = [];
144+ }
145+ $self -> {utf1632_prober } ||= Web::Encoding::UnivCharDet::UTFCharsetProber-> new;
146+ }
147+ if (defined $self -> {utf1632_prober }) {
148+ {
149+ my $st = $self -> {utf1632_prober }-> handle_data ($_ [1]);
150+ if ($st eq ' found it' ) {
151+ $self -> {done } = 1;
152+ $self -> {detected_charset } = $self -> {utf1632_prober }-> get_charset_name; # non-undef when found
153+ return 1;
154+ }
155+ }
156+ }
157+
136158 if ($self -> {input_state } eq ' esc ascii' ) {
137159 $self -> {esc_charset_prober }
138160 ||= Web::Encoding::UnivCharDet::CharsetProber::ESC-> new
139- ($self -> {lang_filter });
140- my $st = $self -> {esc_charset_prober }-> handle_data ($_ [1]);
141- if ($st eq ' found it' ) {
142- $self -> {done } = 1;
143- $self -> {detected_charset } = $self -> {esc_charset_prober }-> get_charset_name;
161+ ($self -> {lang_filter });
162+ {
163+ my $st = $self -> {esc_charset_prober }-> handle_data ($_ [1]);
164+ if ($st eq ' found it' ) {
165+ $self -> {done } = 1;
166+ $self -> {detected_charset } = $self -> {esc_charset_prober }-> get_charset_name; # non-undef when found
167+ return 1;
168+ }
144169 }
145170 } elsif ($self -> {input_state } eq ' high byte' ) {
146- for (grep { $_ } @{$self -> {charset_probers }}) {
171+ for (grep { defined $_ } @{$self -> {charset_probers }}) {
147172 my $st = $_ -> handle_data ($_ [1]);
148173 if ($st eq ' found it' ) {
149174 $self -> {done } = 1;
150- $self -> {detected_charset } = $_ -> get_charset_name;
175+ $self -> {detected_charset } = $_ -> get_charset_name; # non-undef when found
151176 return 1;
152177 }
153178 }
154179 }
155-
180+
156181 return 1;
157182} # handle_data
158183
## Finalize detection once all input has been fed to |handle_data|.
##
## Does nothing if no data was ever seen (|got_data| is false).
## Otherwise fills in |$self->{reported}|:
##
##   1. If |detected_charset| is already defined, it is reported as-is
##      and |done| is set.
##   2. Else, if a UTF-16/32 prober exists, its charset name (possibly
##      undef) becomes the provisional result.
##   3. Else/additionally, in the "high byte" input state, the charset
##      prober with the greatest confidence overrides the provisional
##      result, provided that confidence exceeds |MINIMUM_THRESHOLD|.
##
## Takes the detector object as its only argument; the return value is
## not meaningful.
sub data_end ($) {
  my $self = $_[0];
  return unless $self->{got_data};

  if (defined $self->{detected_charset}) {
    $self->{done} = 1;
    $self->{reported} = $self->{detected_charset};
    return;
  }

  if (defined $self->{utf1632_prober}) {
    # The prober may not have decided yet, in which case this is undef.
    $self->{reported} = $self->{utf1632_prober}->get_charset_name;
  }

  # This literal has to stay identical to the state value stored by
  # |handle_data|.
  return unless $self->{input_state} eq ' high byte';

  my $best_confidence = 0.0;
  my $best_prober;
  for my $prober (grep { defined $_ } @{$self->{charset_probers} || []}) {
    my $confidence = $prober->get_confidence;
    if ($confidence > $best_confidence) {
      $best_confidence = $confidence;
      $best_prober = $prober;
    }
  }

  # Call the constant with explicit parentheses (as |_detector| does for
  # the FILTER_* constants).  A bare |...::MINIMUM_THRESHOLD| is parsed
  # as a string - numerically 0 - whenever the Defs package has not been
  # compiled before this sub, which would let any non-zero confidence
  # pass the threshold.
  if ($best_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD()) {
    # Might still be undef if the prober has no name (unlikely).
    $self->{reported} = $best_prober->get_charset_name;
  }

  return;
} # data_end
@@ -187,7 +216,13 @@ sub get_reported_charset ($) {
187216} # get_reported_charset
188217
## Print a debugging snapshot of the detector to the currently selected
## output handle: the input state, then the |dump_status| output of
## every prober that currently exists (the group probers, the escape
## prober and the UTF-16/32 prober, in that order), and finally the
## reported charset (empty if none has been reported).
##
## Takes the detector object as its only argument.
sub dump_status ($) {
  my $self = $_[0];

  # No whitespace between |$self| and |->| here: inside a double-quoted
  # string a space would stop the hash element from being interpolated.
  print "Input state: $self->{input_state}\n";

  # Prober slots may be undef (not yet created or already discarded).
  my @probers = grep { defined $_ } (
    @{$self->{charset_probers} || []},
    $self->{esc_charset_prober},
    $self->{utf1632_prober},
  );
  $_->dump_status for @probers;

  print "Reported: @{[$self->{reported} // '']}\n";
} # dump_status
192227
1932281;
0 commit comments