Skip to content

Commit 3f302bc

Browse files
committed
Report plain ASCII as such if possible
1 parent 3d2a8c3 commit 3f302bc

File tree

1 file changed

+29
-4
lines changed

1 file changed

+29
-4
lines changed

lib/Web/Encoding/UnivCharDet.pm

Lines changed: 29 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -74,6 +74,9 @@ sub reset ($) {
7474
delete $self->{esc_charset_prober};
7575
delete $self->{utf1632_prober};
7676
delete $self->{reported};
77+
#delete $self->{nbsp_found};
78+
delete $self->{esc_found};
79+
delete $self->{binary_found};
7780
} # reset
7881

7982
sub handle_data ($$) {
@@ -116,6 +119,9 @@ sub handle_data ($$) {
116119
for my $i (0..($length - 1)) {
117120
my $c = ord substr $_[1], $i, 1;
118121
$zero++ if $c == 0x00;
122+
#if ($c == 0xA0) {
123+
# $self->{nbsp_found} = 1;
124+
#} elsif ($c & 0x80) {
119125
if ($c & 0x80 and $c != 0xA0) {
120126
if ($self->{input_state} ne 'high byte') {
121127
$self->{input_state} = 'high byte';
@@ -133,10 +139,18 @@ sub handle_data ($$) {
133139
unless $self->{lang_filter} & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
134140
}
135141
} else {
136-
if ($self->{input_state} eq 'pure ascii' and
137-
$c == 0x1B or
138-
($c == 0x7B and $self->{last_char} == 0x7E)) { # ~{
139-
$self->{input_state} = 'esc ascii';
142+
if ($self->{input_state} eq 'pure ascii') {
143+
if ($c == 0x1B or $c == 0x0E or $c == 0x0F) {
144+
$self->{input_state} = 'esc ascii';
145+
$self->{esc_found} = 1;
146+
} elsif ($c == 0x7B and $self->{last_char} == 0x7E) { # ~{
147+
$self->{input_state} = 'esc ascii';
148+
} elsif ((0x00 <= $c and $c <= 0x07) or
149+
(0x10 <= $c and $c <= 0x19) or
150+
(0x1C <= $c and $c <= 0x1F) or
151+
$c == 0x7F) {
152+
$self->{binary_found} = 1;
153+
}
140154
}
141155
$self->{last_char} = $c;
142156
}
@@ -212,6 +226,17 @@ sub data_end ($) {
212226
if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
213227
$self->{reported} = $max_prober->get_charset_name; # or undef (but unlikely?)
214228
}
229+
} elsif ($self->{input_state} eq 'pure ascii' or
230+
$self->{input_state} eq 'esc ascii') {
231+
if ($self->{esc_found}) {
232+
#
233+
} elsif ($self->{binary_found}) {
234+
#
235+
#} elsif ($self->{nbsp_found}) {
236+
# $self->{reported} = 'windows-1252';
237+
} else {
238+
$self->{reported} = 'windows-1252'; # ascii
239+
}
215240
}
216241
} # data_end
217242

0 commit comments

Comments
 (0)