Skip to content

Commit c0dc433

Browse files
committed
Updated language model data to support detection of OEM code pages and Macintosh code sets; small adjustments to SBCS/MBCS detection
1 parent 2cbab79 commit c0dc433

File tree

5 files changed

+1068
-45
lines changed

5 files changed

+1068
-45
lines changed

lib/Web/Encoding/UnivCharDet/CharsetProber.pm

Lines changed: 58 additions & 34 deletions
Original file line numberDiff line numberDiff line change
@@ -251,8 +251,10 @@ sub reset ($;$) {
251251
] : [
252252
Web::Encoding::UnivCharDet::CharsetProber::Latin1->new, # [0]
253253
map { Web::Encoding::UnivCharDet::CharsetProber::SBCS->new ($_) }
254-
$Web::Encoding::UnivCharDet::Defs::Windows_1250CzechModel, # [1]
255-
$Web::Encoding::UnivCharDet::Defs::MacRomanSpanishModel, # [2]
254+
# $Web::Encoding::UnivCharDet::Defs::Windows_1250CzechModel, # [1]
255+
$Web::Encoding::UnivCharDet::Defs::Windows_1250CentralModel, # [1]
256+
# $Web::Encoding::UnivCharDet::Defs::MacRomanSpanishModel, # [2]
257+
$Web::Encoding::UnivCharDet::Defs::MacintoshWesternModel, # [2]
256258
$Web::Encoding::UnivCharDet::Defs::Win1251Model,
257259
$Web::Encoding::UnivCharDet::Defs::Koi8rModel,
258260
$Web::Encoding::UnivCharDet::Defs::Iso_8859_5Model,
@@ -340,49 +342,68 @@ sub handle_data ($$) {
340342
my $old_prober_count = @{$self->{probers}};
341343
my @new_prober = $self->{resolve_latin1_refs} ? (
342344
map { Web::Encoding::UnivCharDet::CharsetProber::SBCS->new ($_) }
343-
$Web::Encoding::UnivCharDet::Defs::Windows_1252FrenchModel,
344-
$Web::Encoding::UnivCharDet::Defs::Windows_1252SpanishModel,
345-
$Web::Encoding::UnivCharDet::Defs::Windows_1252PortugueseModel,
346-
$Web::Encoding::UnivCharDet::Defs::Windows_1252GermanModel,
347-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4EstonianModel,
348-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4LatvianModel,
345+
$Web::Encoding::UnivCharDet::Defs::Windows_1252WesternModel,
346+
$Web::Encoding::UnivCharDet::Defs::Windows_1252ScandinavianModel,
347+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252FrenchModel,
348+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252SpanishModel,
349+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252PortugueseModel,
350+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252GermanModel,
351+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4BalticModel,
352+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_4EstonianModel,
353+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_4LatvianModel,
349354
$Web::Encoding::UnivCharDet::Defs::Iso_8859_3EsperantoModel,
350355
) : (
351356
map { Web::Encoding::UnivCharDet::CharsetProber::SBCS->new ($_) }
352-
$Web::Encoding::UnivCharDet::Defs::Windows_1252FrenchModel,
353-
$Web::Encoding::UnivCharDet::Defs::Windows_1252SpanishModel,
354-
$Web::Encoding::UnivCharDet::Defs::Windows_1252PortugueseModel,
355-
$Web::Encoding::UnivCharDet::Defs::Windows_1252GermanModel,
357+
$Web::Encoding::UnivCharDet::Defs::Windows_1252WesternModel,
358+
$Web::Encoding::UnivCharDet::Defs::Windows_1252ScandinavianModel,
359+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252FrenchModel,
360+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252SpanishModel,
361+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252PortugueseModel,
362+
#$Web::Encoding::UnivCharDet::Defs::Windows_1252GermanModel,
356363
#$Web::Encoding::UnivCharDet::Defs::Iso_8859_2HungarianModel,
357-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_2CroatianModel,
358-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_2PolishModel,
359-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_2CzechModell,
364+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_2CentralModel,
365+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_2CroatianModel,
366+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_2PolishModel,
367+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_2CzechModell,
360368
#$Web::Encoding::UnivCharDet::Defs::Windows_1250HungarianModel,
361-
$Web::Encoding::UnivCharDet::Defs::Windows_1250CroatianModel,
362-
$Web::Encoding::UnivCharDet::Defs::Windows_1250PolishModel,
369+
# $Web::Encoding::UnivCharDet::Defs::Windows_1250CroatianModel,
370+
# $Web::Encoding::UnivCharDet::Defs::Windows_1250PolishModel,
363371
#$Web::Encoding::UnivCharDet::Defs::Windows_1250CzechModel,
364-
$Web::Encoding::UnivCharDet::Defs::Windows_1257EstonianModel,
365-
$Web::Encoding::UnivCharDet::Defs::Windows_1257LithuanianModel,
366-
$Web::Encoding::UnivCharDet::Defs::Windows_1257LatvianModel,
367-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_13LatvianModel,
368-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4EstonianModel,
369-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4LatvianModel,
372+
$Web::Encoding::UnivCharDet::Defs::Windows_1257BalticModel,
373+
# $Web::Encoding::UnivCharDet::Defs::Windows_1257EstonianModel,
374+
# $Web::Encoding::UnivCharDet::Defs::Windows_1257LithuanianModel,
375+
# $Web::Encoding::UnivCharDet::Defs::Windows_1257LatvianModel,
376+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_13BalticModel,
377+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_13LatvianModel,
378+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_4EstonianModel,
379+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_4LatvianModel,
370380
$Web::Encoding::UnivCharDet::Defs::Iso_8859_3EsperantoModel,
371-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_10LithuanianModel,
381+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_10LithuanianModel,
372382
$Web::Encoding::UnivCharDet::Defs::Iso_8859_15EstonianModel,
373383
$Web::Encoding::UnivCharDet::Defs::Iso_8859_15FrenchModel,
374-
$Web::Encoding::UnivCharDet::Defs::Iso_8859_16RomanianModel,
384+
# $Web::Encoding::UnivCharDet::Defs::Iso_8859_16RomanianModel,
375385
#$Web::Encoding::UnivCharDet::Defs::Iso_8859_9TurkishModel,
376386
$Web::Encoding::UnivCharDet::Defs::Windows_1254TurkishModel,
377-
#$Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel,
378-
$Web::Encoding::UnivCharDet::Defs::Mac_CentraleuropeCzechModel,
379-
$Web::Encoding::UnivCharDet::Defs::Ibm852CzechModel,
380-
$Web::Encoding::UnivCharDet::Defs::Ibm852PolishModel,
387+
$Web::Encoding::UnivCharDet::Defs::Ibm437WesternModel,
388+
$Web::Encoding::UnivCharDet::Defs::Ibm850WesternModel,
389+
$Web::Encoding::UnivCharDet::Defs::Ibm850ScandinavianModel,
390+
$Web::Encoding::UnivCharDet::Defs::Ibm852CentralModel,
391+
# $Web::Encoding::UnivCharDet::Defs::Ibm852CzechModel,
392+
# $Web::Encoding::UnivCharDet::Defs::Ibm852PolishModel,
393+
$Web::Encoding::UnivCharDet::Defs::Ibm775BalticModel,
394+
$Web::Encoding::UnivCharDet::Defs::Ibm857TurkishModel,
381395
$Web::Encoding::UnivCharDet::Defs::Ibm865DanishModel,
382396
$Web::Encoding::UnivCharDet::Defs::Windows_1252IcelandicFaroeseModel,
397+
$Web::Encoding::UnivCharDet::Defs::MacintoshScandinavianModel,
398+
$Web::Encoding::UnivCharDet::Defs::X_Mac_CeCentralModel,
399+
# $Web::Encoding::UnivCharDet::Defs::Mac_CentraleuropeCzechModel,
400+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_4BalticModel,
401+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_10BalticModel,
402+
$Web::Encoding::UnivCharDet::Defs::Iso_8859_16CentralModel,
383403

384404
#$Web::Encoding::UnivCharDet::Defs::Win1250HungarianModel,
385405
#$Web::Encoding::UnivCharDet::Defs::Latin2HungarianModel,
406+
#$Web::Encoding::UnivCharDet::Defs::Windows_1258VietnameseModel,
386407
);
387408
push @{$self->{probers}}, @new_prober;
388409
$self->{active_num} += @new_prober;
@@ -465,7 +486,7 @@ sub dump_status ($) {
465486
$_->dump_status if defined $_;
466487
}
467488
for (@{$self->{inactive_probers}}) {
468-
print " [inactive]";
489+
print " [in]";
469490
$_->dump_status;
470491
}
471492
} # dump_status
@@ -507,7 +528,10 @@ sub SYM () { 253 }
507528
sub RET () { 252 }
508529
sub NUM () { 251 }
509530
sub CPY () { 250 }
510-
sub SYMBOL_CAT_ORDER () { 249 }
531+
sub TMK () { 249 }
532+
sub ORD () { 248 }
533+
sub DLM () { 247 }
534+
sub SYMBOL_CAT_ORDER () { 246 }
511535

512536
sub new ($$;$$) {
513537
my $self = bless {}, $_[0];
@@ -574,13 +598,13 @@ sub handle_data ($$) {
574598
$self->{seq_counters}->[NEGATIVE_CAT]++;
575599
$self->{total_seqs}++;
576600
}
577-
} elsif ($order == SYM) {
601+
} elsif ($order == SYM or $order == DLM or $order == TMK or $order == ORD) {
578602
$self->{seq_counters}->[SYM_CAT]++;
579603
if ($self->{symbol_state} == 1) {
580604
$self->{seq_counters}->[CPY_CAT]++;
581605
}
582606
} elsif ($order == CPY) {
583-
if ($self->{last_order} == SYM or $self->{last_order} == 255) {
607+
if ($self->{last_order} == DLM or $self->{last_order} == 255) {
584608
$self->{seq_counters}->[SYM_CAT]++;
585609
$self->{symbol_state} = 1;
586610

@@ -1027,7 +1051,7 @@ sub dump_status ($) {
10271051
my $self = $_[0];
10281052
$self->get_confidence;
10291053
printf " MBCS [%s] %s [%s] (%s)\n",
1030-
$self->get_charset_name,
1054+
$self->get_charset_name // '',
10311055
$self->{resolve_latin1_refs} ? 'htmlrefs' : '',
10321056
$self->get_confidence,
10331057
$self->{state};

lib/Web/Encoding/UnivCharDet/CodingStateMachine.pm

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,7 +35,7 @@ my $Latin1Type = [
3535
4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 1, 0, 1, 0, 0,
3636
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
3737
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
38-
0, 0, 0, 0, 0, 0, 6, 5, 5, 2, 5, 5, 5, 5, 5, 5,
38+
0, 7, 0, 7, 7, 0, 6, 5, 5, 2, 5, 5, 5, 5, 5, 5,
3939
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4040
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
4141
6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5,
@@ -62,6 +62,9 @@ sub next_state ($$) {
6262
$self->{current_byte_pos}++;
6363
if ($self->{latin1_state} == 1 and $Latin1Type->[$cc] == 2) {
6464
$self->{latin1_state} = 2;
65+
} elsif ($self->{latin1_state} == 1 and $Latin1Type->[$cc] == 7) {
66+
$self->{latin1_count}++;
67+
$self->{latin1_state} = 0;
6568
} elsif ($self->{latin1_state} == 2 and
6669
($Latin1Type->[$cc] == 1 or
6770
$Latin1Type->[$cc] == 3 or

0 commit comments

Comments
 (0)