|
| 1 | +package Web::Encoding::UnivCharDet::MacCharsetProber; |
| 2 | +use strict; |
| 3 | +use warnings; |
| 4 | +our $VERSION = '1.0'; |
| 5 | +use Web::Encoding::UnivCharDet::CharsetProber; |
| 6 | + |
| 7 | +package Web::Encoding::UnivCharDet::MacCharsetProber::MacRoman; |
| 8 | +push our @ISA, qw(Web::Encoding::UnivCharDet::CharsetProber); |
| 9 | +our $VERSION = '1.0'; |
| 10 | + |
| 11 | +sub FREQ_CAT_NUM () { 4 } |
| 12 | +sub UDF () { 0 } |
| 13 | +sub OTH () { 1 } |
| 14 | +sub ASC () { 2 } |
| 15 | +sub ASS () { 3 } |
| 16 | +sub ACV () { 4 } |
| 17 | +sub ACO () { 5 } |
| 18 | +sub ASV () { 6 } |
| 19 | +sub ASO () { 7 } |
| 20 | +sub ODD () { 8 } |
| 21 | +sub CLASS_NUM () { 9 } |
| 22 | + |
| 23 | +# The change from Latin1 is that we explicitly look for extended characters |
| 24 | +# that are infrequently-occurring symbols, and consider them to always be |
| 25 | +# improbable. This should let MacRoman get out of the way of more likely |
| 26 | +# encodings in most situations. |
| 27 | + |
| 28 | +my $MacRoman_CharToClass = [ |
| 29 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 00 - 07 |
| 30 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 08 - 0F |
| 31 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 10 - 17 |
| 32 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 18 - 1F |
| 33 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 20 - 27 |
| 34 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 28 - 2F |
| 35 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 30 - 37 |
| 36 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # 38 - 3F |
| 37 | + OTH, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 40 - 47 |
| 38 | + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 48 - 4F |
| 39 | + ASC, ASC, ASC, ASC, ASC, ASC, ASC, ASC, # 50 - 57 |
| 40 | + ASC, ASC, ASC, OTH, OTH, OTH, OTH, OTH, # 58 - 5F |
| 41 | + OTH, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 60 - 67 |
| 42 | + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 68 - 6F |
| 43 | + ASS, ASS, ASS, ASS, ASS, ASS, ASS, ASS, # 70 - 77 |
| 44 | + ASS, ASS, ASS, OTH, OTH, OTH, OTH, OTH, # 78 - 7F |
| 45 | + ACV, ACV, ACO, ACV, ACO, ACV, ACV, ASV, # 80 - 87 |
| 46 | + ASV, ASV, ASV, ASV, ASV, ASO, ASV, ASV, # 88 - 8F |
| 47 | + ASV, ASV, ASV, ASV, ASV, ASV, ASO, ASV, # 90 - 97 |
| 48 | + ASV, ASV, ASV, ASV, ASV, ASV, ASV, ASV, # 98 - 9F |
| 49 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ASO, # A0 - A7 |
| 50 | + OTH, OTH, ODD, ODD, OTH, OTH, ACV, ACV, # A8 - AF |
| 51 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, OTH, # B0 - B7 |
| 52 | + OTH, OTH, OTH, OTH, OTH, OTH, ASV, ASV, # B8 - BF |
| 53 | + OTH, OTH, ODD, OTH, ODD, OTH, OTH, OTH, # C0 - C7 |
| 54 | + OTH, OTH, OTH, ACV, ACV, ACV, ACV, ASV, # C8 - CF |
| 55 | + OTH, OTH, OTH, OTH, OTH, OTH, OTH, ODD, # D0 - D7 |
| 56 | + ASV, ACV, ODD, OTH, OTH, OTH, OTH, OTH, # D8 - DF |
| 57 | + OTH, OTH, OTH, OTH, OTH, ACV, ACV, ACV, # E0 - E7 |
| 58 | + ACV, ACV, ACV, ACV, ACV, ACV, ACV, ACV, # E8 - EF |
| 59 | + ODD, ACV, ACV, ACV, ACV, ASV, ODD, ODD, # F0 - F7 |
| 60 | + ODD, ODD, ODD, ODD, ODD, ODD, ODD, ODD, # F8 - FF |
| 61 | +]; |
| 62 | + |
| 63 | +# 0 : illegal |
| 64 | +# 1 : very unlikely |
| 65 | +# 2 : normal |
| 66 | +# 3 : very likely |
| 67 | +my $MacRomanClassModel = [ |
| 68 | +# UDF OTH ASC ASS ACV ACO ASV ASO ODD |
| 69 | + 0, 0, 0, 0, 0, 0, 0, 0, 0, # UDF |
| 70 | + 0, 3, 3, 3, 3, 3, 3, 3, 1, # OTH |
| 71 | + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ASC |
| 72 | + 0, 3, 3, 3, 1, 1, 3, 3, 1, # ASS |
| 73 | + 0, 3, 3, 3, 1, 2, 1, 2, 1, # ACV |
| 74 | + 0, 3, 3, 3, 3, 3, 3, 3, 1, # ACO |
| 75 | + 0, 3, 1, 3, 1, 1, 1, 3, 1, # ASV |
| 76 | + 0, 3, 1, 3, 1, 1, 3, 3, 1, # ASO |
| 77 | + 0, 1, 1, 1, 1, 1, 1, 1, 1, # ODD |
| 78 | +]; |
| 79 | + |
| 80 | +sub new ($) { |
| 81 | + my $self = bless {}, $_[0]; |
| 82 | + $self->reset; |
| 83 | + return $self; |
| 84 | +} # new |
| 85 | + |
| 86 | +sub reset ($) { |
| 87 | + my $self = $_[0]; |
| 88 | + $self->{state} = 'detecting'; |
| 89 | + $self->{last_char_class} = OTH; |
| 90 | + $self->{freq_counter}->[$_] = 0 for 0..(FREQ_CAT_NUM - 1); |
| 91 | + |
| 92 | + # express the prior that MacRoman is a somewhat rare encoding; this |
| 93 | + # can be done by starting out in a slightly improbable state that |
| 94 | + # must be overcome |
| 95 | + $self->{freq_counter}->[2] = 10; |
| 96 | +} # reset |
| 97 | + |
| 98 | +sub get_charset_name ($) { 'macintosh' } |
| 99 | + |
| 100 | +sub handle_data ($$) { |
| 101 | + my $self = $_[0]; |
| 102 | + my $new_buf1 = $self->filter_with_english_letters ($_[1]); |
| 103 | + |
| 104 | + for my $i (0..((length $new_buf1) - 1)) { |
| 105 | + my $c = ord substr $new_buf1, $i, 1; |
| 106 | + my $char_class = $MacRoman_CharToClass->[$c]; |
| 107 | + my $freq = $MacRomanClassModel->[$self->{last_char_class}*CLASS_NUM + $char_class]; |
| 108 | + if ($freq == 0) { |
| 109 | + $self->{state} = 'not me'; |
| 110 | + last; |
| 111 | + } |
| 112 | + $self->{freq_counter}->[$freq]++; |
| 113 | + $self->{last_char_class} = $char_class; |
| 114 | + } # $i |
| 115 | + |
| 116 | + return $self->{state}; |
| 117 | +} # handle_data |
| 118 | + |
| 119 | +sub get_confidence ($) { |
| 120 | + my $self = $_[0]; |
| 121 | + if ($self->{state} eq 'not me') { |
| 122 | + return 0.01; |
| 123 | + } |
| 124 | + |
| 125 | + my $total = 0; |
| 126 | + for my $i (0..(FREQ_CAT_NUM - 1)) { |
| 127 | + $total += $self->{freq_counter}->[$i]; |
| 128 | + } |
| 129 | + |
| 130 | + my $confidence; |
| 131 | + if ($total < 0.01) { |
| 132 | + $confidence = 0.0; |
| 133 | + } else { |
| 134 | + $confidence = ($self->{freq_counter}->[3] - $self->{freq_counter}->[1] * 20.0) / $total; |
| 135 | + } |
| 136 | + $confidence = 0.0 if $confidence < 0.0; |
| 137 | + |
| 138 | + ## lower the confidence of MacRoman so that other more accurate |
| 139 | + ## detector can take priority. |
| 140 | + $confidence *= 0.73; |
| 141 | + |
| 142 | + return $confidence; |
| 143 | +} # get_confidence |
| 144 | + |
| 145 | +sub dump_status ($) { |
| 146 | + my $self = $_[0]; |
| 147 | + printf " MacRomanProber: %1.3f [%s]\n", |
| 148 | + $self->get_confidence, $self->get_charset_name; |
| 149 | +} # dump_status |
| 150 | + |
| 151 | +1; |
| 152 | + |
| 153 | +=head1 AUTHOR |
| 154 | +
|
| 155 | +Wakaba <wakaba@suikawiki.org>. |
| 156 | +
|
| 157 | +=head1 ACKNOWLEDGEMENTS |
| 158 | +
|
| 159 | +This module derived from |
| 160 | +<https://github.com/chardet/chardet/commit/c292b52a97e57c95429ef559af36845019b88b33>. |
| 161 | +
|
| 162 | +=head1 LICENSE |
| 163 | +
|
| 164 | +######################## BEGIN LICENSE BLOCK ######################## |
| 165 | +# This code was modified from latin1prober.py by Rob Speer <rob@lumino.so>. |
| 166 | +# The Original Code is Mozilla Universal charset detector code. |
| 167 | +# |
| 168 | +# The Initial Developer of the Original Code is |
| 169 | +# Netscape Communications Corporation. |
| 170 | +# Portions created by the Initial Developer are Copyright (C) 2001 |
| 171 | +# the Initial Developer. All Rights Reserved. |
| 172 | +# |
| 173 | +# Contributor(s): |
| 174 | +# Wakaba <wakaba@suikawiki.org> |
| 175 | +# Rob Speer - adapt to MacRoman encoding |
| 176 | +# Mark Pilgrim - port to Python |
| 177 | +# Shy Shalom - original C code |
| 178 | +# |
| 179 | +# This library is free software; you can redistribute it and/or |
| 180 | +# modify it under the terms of the GNU Lesser General Public |
| 181 | +# License as published by the Free Software Foundation; either |
| 182 | +# version 2.1 of the License, or (at your option) any later version. |
| 183 | +# |
| 184 | +# This library is distributed in the hope that it will be useful, |
| 185 | +# but WITHOUT ANY WARRANTY; without even the implied warranty of |
| 186 | +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 187 | +# Lesser General Public License for more details. |
| 188 | +# |
| 189 | +# You should have received a copy of the GNU Lesser General Public |
| 190 | +# License along with this library; if not, write to the Free Software |
| 191 | +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA |
| 192 | +# 02110-1301 USA |
| 193 | +######################### END LICENSE BLOCK ######################### |
| 194 | +
|
| 195 | +=cut |
0 commit comments