@@ -74,6 +74,9 @@ sub reset ($) {
7474 delete $self -> {esc_charset_prober };
7575 delete $self -> {utf1632_prober };
7676 delete $self -> {reported };
77+ # delete $self->{nbsp_found};
78+ delete $self -> {esc_found };
79+ delete $self -> {binary_found };
7780} # reset
7881
7982sub handle_data ($$) {
@@ -116,6 +119,9 @@ sub handle_data ($$) {
116119 for my $i (0..($length - 1)) {
117120 my $c = ord substr $_ [1], $i , 1;
118121 $zero ++ if $c == 0x00;
122+ # if ($c == 0xA0) {
123+ # $self->{nbsp_found} = 1;
124+ # } elsif ($c & 0x80) {
119125 if ($c & 0x80 and $c != 0xA0) {
120126 if ($self -> {input_state } ne ' high byte' ) {
121127 $self -> {input_state } = ' high byte' ;
@@ -133,10 +139,18 @@ sub handle_data ($$) {
133139 unless $self -> {lang_filter } & Web::Encoding::UnivCharDet::Defs::FILTER_NON_CJK;
134140 }
135141 } else {
136- if ($self -> {input_state } eq ' pure ascii' and
137- $c == 0x1B or
138- ($c == 0x7B and $self -> {last_char } == 0x7E)) { # ~{
139- $self -> {input_state } = ' esc ascii' ;
142+ if ($self -> {input_state } eq ' pure ascii' ) {
143+ if ($c == 0x1B or $c == 0x0E or $c == 0x0F) {
144+ $self -> {input_state } = ' esc ascii' ;
145+ $self -> {esc_found } = 1;
146+ } elsif ($c == 0x7B and $self -> {last_char } == 0x7E) { # ~{
147+ $self -> {input_state } = ' esc ascii' ;
148+ } elsif ((0x00 <= $c and $c <= 0x07) or
149+ (0x10 <= $c and $c <= 0x19) or
150+ (0x1C <= $c and $c <= 0x1F) or
151+ $c == 0x7F) {
152+ $self -> {binary_found } = 1;
153+ }
140154 }
141155 $self -> {last_char } = $c ;
142156 }
@@ -212,6 +226,17 @@ sub data_end ($) {
212226 if ($max_prober_confidence > Web::Encoding::UnivCharDet::Defs::MINIMUM_THRESHOLD) {
213227 $self -> {reported } = $max_prober -> get_charset_name; # or undef (but unlikely?)
214228 }
229+ } elsif ($self -> {input_state } eq ' pure ascii' or
230+ $self -> {input_state } eq ' esc ascii' ) {
231+ if ($self -> {esc_found }) {
232+ #
233+ } elsif ($self -> {binary_found }) {
234+ #
235+ # } elsif ($self->{nbsp_found}) {
236+ # $self->{reported} = 'windows-1252';
237+ } else {
238+ $self -> {reported } = ' windows-1252' ; # ascii
239+ }
215240 }
216241} # data_end
217242
0 commit comments