Skip to content

Commit

Permalink
Trim bloated encoding table
Browse files — browse the repository at this point in the history
  • Loading branch information
kornelski committed Nov 6, 2024
1 parent 16a4a49 commit e3d52ff
Show file tree
Hide file tree
Showing 2 changed files with 12 additions and 204 deletions.
205 changes: 9 additions & 196 deletions src/base/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -5,193 +5,31 @@ use std::sync::Arc;

/// This serves as a map from integer to [`Encoding`], which allows more efficient
/// sets/gets of the [`SharedEncoding`].
static ALL_ENCODINGS: [&Encoding; 228] = [
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::IBM866_INIT,
&encoding_rs::MACINTOSH_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::BIG5_INIT,
static ALL_ENCODINGS: [&Encoding; 40] = [
&encoding_rs::UTF_8_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::IBM866_INIT,
&encoding_rs::UTF_8_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::WINDOWS_1250_INIT,
&encoding_rs::WINDOWS_1251_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::WINDOWS_1253_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::WINDOWS_1255_INIT,
&encoding_rs::BIG5_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::WINDOWS_1256_INIT,
&encoding_rs::IBM866_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::WINDOWS_1257_INIT,
&encoding_rs::WINDOWS_1258_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::EUC_JP_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::KOI8_U_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::GB18030_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::BIG5_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::ISO_8859_8_I_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::KOI8_U_INIT,
&encoding_rs::WINDOWS_1250_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::WINDOWS_1251_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::WINDOWS_1253_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::WINDOWS_1255_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::BIG5_INIT,
&encoding_rs::WINDOWS_1256_INIT,
&encoding_rs::IBM866_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::WINDOWS_1257_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::WINDOWS_1258_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::UTF_16BE_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::EUC_JP_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_13_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::ISO_8859_14_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::MACINTOSH_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_13_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::ISO_8859_14_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::ISO_8859_8_I_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::BIG5_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::ISO_8859_13_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_14_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_16_INIT,
&encoding_rs::ISO_8859_10_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::ISO_8859_15_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::UTF_16BE_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::MACINTOSH_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_I_INIT,
&encoding_rs::SHIFT_JIS_INIT,
&encoding_rs::KOI8_R_INIT,
&encoding_rs::KOI8_U_INIT,
&encoding_rs::MACINTOSH_INIT,
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::ISO_2022_JP_INIT,
&encoding_rs::ISO_2022_JP_INIT,
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::WINDOWS_1250_INIT,
&encoding_rs::WINDOWS_1251_INIT,
&encoding_rs::WINDOWS_1252_INIT,
Expand All @@ -201,39 +39,14 @@ static ALL_ENCODINGS: [&Encoding; 228] = [
&encoding_rs::WINDOWS_1256_INIT,
&encoding_rs::WINDOWS_1257_INIT,
&encoding_rs::WINDOWS_1258_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_I_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::UTF_8_INIT,
&encoding_rs::UTF_8_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::EUC_KR_INIT,
&encoding_rs::WINDOWS_874_INIT,
&encoding_rs::X_MAC_CYRILLIC_INIT,
&encoding_rs::X_USER_DEFINED_INIT,
&encoding_rs::GBK_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::WINDOWS_1252_INIT,
&encoding_rs::ISO_8859_2_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::ISO_8859_3_INIT,
&encoding_rs::ISO_8859_4_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::UTF_8_INIT,
&encoding_rs::WINDOWS_1254_INIT,
&encoding_rs::ISO_8859_7_INIT,
&encoding_rs::X_MAC_CYRILLIC_INIT,
// non-ASCII-compatible
&encoding_rs::REPLACEMENT_INIT,
&encoding_rs::ISO_8859_6_INIT,
&encoding_rs::ISO_8859_8_INIT,
&encoding_rs::UTF_8_INIT,
&encoding_rs::ISO_8859_5_INIT,
&encoding_rs::EUC_JP_INIT,
&encoding_rs::UTF_16BE_INIT,
&encoding_rs::UTF_16LE_INIT,
&encoding_rs::ISO_2022_JP_INIT,
];

fn encoding_to_index(encoding: AsciiCompatibleEncoding) -> usize {
Expand Down
11 changes: 3 additions & 8 deletions src/rewriter/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -31,17 +31,12 @@ impl AsciiCompatibleEncoding {
/// Returns `Some` if `Encoding` is ASCII-compatible, or `None` otherwise.
#[must_use]
pub fn new(encoding: &'static Encoding) -> Option<Self> {
if encoding.is_ascii_compatible() {
Some(Self(encoding))
} else {
None
}
encoding.is_ascii_compatible().then_some(Self(encoding))
}

fn from_mimetype(mime: &Mime) -> Option<Self> {
mime.get_param("charset")
.and_then(|cs| Encoding::for_label_no_replacement(cs.as_str().as_bytes()))
.and_then(Self::new)
let cs = mime.get_param("charset")?;
Self::new(Encoding::for_label_no_replacement(cs.as_str().as_bytes())?)
}

/// Returns the most commonly used UTF-8 encoding.
Expand Down

0 comments on commit e3d52ff

Please sign in to comment.