diff --git a/wasm/Cargo.lock b/wasm/Cargo.lock index fa030bd8..bcf459b7 100644 --- a/wasm/Cargo.lock +++ b/wasm/Cargo.lock @@ -4,15 +4,9 @@ version = 3 [[package]] name = "bumpalo" -version = "3.11.1" +version = "3.14.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "572f695136211188308f16ad2ca5c851a712c464060ae6974944458eb83880ba" - -[[package]] -name = "cfg-if" -version = "0.1.10" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4785bdd1c96b2a846b2bd7cc02e86b6b3dbf14e7e53446c4f54c92a361040822" +checksum = "7f30e7476521f6f8af1a1c4c0b8cc94f0bee37d91763d0ca2665f299b6cd8aec" [[package]] name = "cfg-if" @@ -26,24 +20,15 @@ version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a06aeb73f470f66dcdbf7223caeebb85984942f22f1adb2a088cf9668146bbbc" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen", ] [[package]] name = "itoa" -version = "1.0.4" +version = "1.0.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4217ad341ebadf8d8e724e264f13e593e0648f5b3e94b3896a5df283be015ecc" - -[[package]] -name = "js-sys" -version = "0.3.61" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "445dde2150c55e483f3d8416706b97ec8e8237c307e5b7b4b8dd15e6af2a0730" -dependencies = [ - "wasm-bindgen", -] +checksum = "b1a46d1a171d865aa5f83f92695765caa047a9b4cbae2cbf37dbd613a793fd4c" [[package]] name = "lazy_static" @@ -51,77 +36,56 @@ version = "1.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "e2abad23fbc42b3700f2f279844dc832adb2b2eb069b2df918f455c4e18cc646" -[[package]] -name = "libc" -version = "0.2.135" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "68783febc7782c6c5cb401fbda4de5a9898be1762314da0bb2c10ced61f18b0c" - [[package]] name = "log" -version = "0.4.17" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"abb12e687cfb44aa40f41fc3978ef76448f9b6038cad6aef4259d3c095a2382e" -dependencies = [ - "cfg-if 1.0.0", -] - -[[package]] -name = "memory_units" -version = "0.4.0" +version = "0.4.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8452105ba047068f40ff7093dd1d9da90898e63dd61736462e9cdda6a90ad3c3" +checksum = "b5e6163cb8c49088c2c36f57875e58ccd8c87c7427f7fbd50ea6710b2f3f2e8f" [[package]] name = "once_cell" -version = "1.15.0" +version = "1.19.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e82dad04139b71a90c080c8463fe0dc7902db5192d939bd0950f074d014339e1" +checksum = "3fdb12b2476b595f9358c5161aa467c2438859caa136dec86c26fdd2efe17b92" [[package]] name = "proc-macro2" -version = "1.0.50" +version = "1.0.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ef7d57beacfaf2d8aee5937dab7b7f28de3cb8b1828479bb5de2a7106f2bae2" +checksum = "39278fbbf5fb4f646ce651690877f89d1c5811a3d4acb27700c1cb3cdb78fd3b" dependencies = [ "unicode-ident", ] [[package]] name = "quote" -version = "1.0.23" +version = "1.0.33" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "8856d8364d252a14d474036ea1358d63c9e6965c8e5c1885c18f73d70bff9c7b" +checksum = "5267fca4496028628a95160fc423a33e8b2e6af8a5302579e322e4b520293cae" dependencies = [ "proc-macro2", ] [[package]] name = "ryu" -version = "1.0.11" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4501abdff3ae82a1c1b477a17252eb69cee9e66eb915c1abaa4f44d873df9f09" - -[[package]] -name = "scoped-tls" -version = "1.0.0" +version = "1.0.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ea6a9290e3c9cf0f18145ef7ffa62d68ee0bf5fcd651017e586dc7fd5da448c2" +checksum = "f98d2aa92eebf49b69786be48e4477826b256916e84a57ff2a4f21923b48eb4c" [[package]] name = "serde" -version = "1.0.152" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"bb7d1f0d3021d347a83e556fc4683dea2ea09d87bccdf88ff5c12545d89d5efb" +checksum = "25dd9975e68d0cb5aa1120c288333fc98731bd1dd12f561e468ea4728c042b89" dependencies = [ "serde_derive", ] [[package]] name = "serde_derive" -version = "1.0.152" +version = "1.0.193" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af487d118eecd09402d70a5d72551860e788df87b464af30e5ea6a38c75c541e" +checksum = "43576ca501357b9b071ac53cdc7da8ef0cbd9493d8df094cd821777ea6e894d3" dependencies = [ "proc-macro2", "quote", @@ -130,9 +94,9 @@ dependencies = [ [[package]] name = "serde_json" -version = "1.0.91" +version = "1.0.108" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "877c235533714907a8c2464236f5c4b2a17262ef1bd71f38f35ea592c8da6883" +checksum = "3d1c7e3eac408d115102c4c24ad393e0821bb3a5df4d506a80f85f7a742a526b" dependencies = [ "itoa", "ryu", @@ -141,9 +105,9 @@ dependencies = [ [[package]] name = "syn" -version = "1.0.105" +version = "2.0.41" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "60b9b43d45702de4c839cb9b51d9f529c5dd26a4aff255b42b1ebc03e88ee908" +checksum = "44c8b28c477cc3bf0e7966561e3460130e1255f7a1cf71931075f1c5e7a7e269" dependencies = [ "proc-macro2", "quote", @@ -152,15 +116,9 @@ dependencies = [ [[package]] name = "unicode-ident" -version = "1.0.5" +version = "1.0.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6ceab39d59e4c9499d4e5a8ee0e2735b891bb7308ac83dfb4e80cad195c9f6f3" - -[[package]] -name = "unicode-rs" -version = "0.1.3" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3213cd2f9054a269d117977ace8a58ab57a4244b814e42f2562ebf6197e83f7f" +checksum = "3354b9ac3fae1ff6755cb6db53683adb661634f67557942dea4facebec0fee4b" [[package]] name = "unicode-rustwasm" @@ -172,27 +130,24 @@ dependencies = [ "quote", "serde", "serde_json", - "unicode-rs", "wasm-bindgen", - "wasm-bindgen-test", - "wee_alloc", ] [[package]] name = 
"wasm-bindgen" -version = "0.2.84" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "31f8dcbc21f30d9b8f2ea926ecb58f6b91192c17e9d33594b3df58b2007ca53b" +checksum = "0ed0d4f68a3015cc185aff4db9506a015f4b96f95303897bfa23f846db54064e" dependencies = [ - "cfg-if 1.0.0", + "cfg-if", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.84" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "95ce90fd5bcc06af55a641a86428ee4229e44e07033963a2290a8e241607ccb9" +checksum = "1b56f625e64f3a1084ded111c4d5f477df9f8c92df113852fa5a374dbda78826" dependencies = [ "bumpalo", "log", @@ -203,23 +158,11 @@ dependencies = [ "wasm-bindgen-shared", ] -[[package]] -name = "wasm-bindgen-futures" -version = "0.4.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f219e0d211ba40266969f6dbdd90636da12f75bee4fc9d6c23d1260dadb51454" -dependencies = [ - "cfg-if 1.0.0", - "js-sys", - "wasm-bindgen", - "web-sys", -] - [[package]] name = "wasm-bindgen-macro" -version = "0.2.84" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4c21f77c0bedc37fd5dc21f897894a5ca01e7bb159884559461862ae90c0b4c5" +checksum = "0162dbf37223cd2afce98f3d0785506dcb8d266223983e4b5b525859e6e182b2" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -227,9 +170,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.84" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2aff81306fcac3c7515ad4e177f521b5c9a15f2b08f4e32d823066102f35a5f6" +checksum = "f0eb82fcb7930ae6219a7ecfd55b217f5f0893484b7a13022ebb2b2bf20b5283" dependencies = [ "proc-macro2", "quote", @@ -240,74 +183,6 @@ dependencies = [ [[package]] name = "wasm-bindgen-shared" -version = "0.2.84" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"0046fef7e28c3804e5e38bfa31ea2a0f73905319b677e57ebe37e49358989b5d" - -[[package]] -name = "wasm-bindgen-test" -version = "0.3.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6db36fc0f9fb209e88fb3642590ae0205bb5a56216dabd963ba15879fe53a30b" -dependencies = [ - "console_error_panic_hook", - "js-sys", - "scoped-tls", - "wasm-bindgen", - "wasm-bindgen-futures", - "wasm-bindgen-test-macro", -] - -[[package]] -name = "wasm-bindgen-test-macro" -version = "0.3.34" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0734759ae6b3b1717d661fe4f016efcfb9828f5edb4520c18eaee05af3b43be9" -dependencies = [ - "proc-macro2", - "quote", -] - -[[package]] -name = "web-sys" -version = "0.3.60" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bcda906d8be16e728fd5adc5b729afad4e444e106ab28cd1c7256e54fa61510f" -dependencies = [ - "js-sys", - "wasm-bindgen", -] - -[[package]] -name = "wee_alloc" -version = "0.4.5" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "dbb3b5a6b2bb17cb6ad44a2e68a43e8d2722c997da10e928665c72ec6c0a0b8e" -dependencies = [ - "cfg-if 0.1.10", - "libc", - "memory_units", - "winapi", -] - -[[package]] -name = "winapi" -version = "0.3.9" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5c839a674fcd7a98952e593242ea400abe93992746761e38641405d28b00f419" -dependencies = [ - "winapi-i686-pc-windows-gnu", - "winapi-x86_64-pc-windows-gnu", -] - -[[package]] -name = "winapi-i686-pc-windows-gnu" -version = "0.4.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" - -[[package]] -name = "winapi-x86_64-pc-windows-gnu" -version = "0.4.0" +version = "0.2.89" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +checksum = 
"7ab9b36309365056cd639da3134bf87fa8f3d86008abf99e612384a6eecd459f" diff --git a/wasm/Cargo.toml b/wasm/Cargo.toml index 7bf96a25..463e287b 100644 --- a/wasm/Cargo.toml +++ b/wasm/Cargo.toml @@ -11,35 +11,21 @@ publish = false [lib] crate-type = ["cdylib", "rlib"] -[features] -default = ["console_error_panic_hook"] - [dependencies] -wasm-bindgen = "0.2" -serde = { version = "1.0", features = ["derive"] } -serde_json = "1.0.91" +wasm-bindgen = "0.2.89" +serde = { version = "1.0.193", features = ["derive"] } +serde_json = "1.0.108" lazy_static = "1.4.0" -unicode-rs = "0.1.2" # The `console_error_panic_hook` crate provides better debugging of panics by # logging them with `console.error`. This is great for development, but requires # all the `std::fmt` and `std::panicking` infrastructure, so isn't great for # code size when deploying. -console_error_panic_hook = { version = "0.1.1", optional = true } - -# `wee_alloc` is a tiny allocator for wasm that is only ~1K in code size -# compared to the default allocator's ~10K. It is slower than the default -# allocator, however. -# -# Unfortunately, `wee_alloc` requires nightly Rust when targeting wasm for now. -wee_alloc = { version = "0.4.2", optional = true } - -[dev-dependencies] -wasm-bindgen-test = "0.3" +console_error_panic_hook = { version = "0.1.7" } [build-dependencies] -proc-macro2 = "1.0.50" -quote = "1.0.23" +proc-macro2 = "1.0.70" +quote = "1.0.33" [profile.release] # Tell `rustc` to optimize for small code size. diff --git a/wasm/src/codepoint_type.rs b/wasm/src/codepoint_type.rs new file mode 100644 index 00000000..221a7768 --- /dev/null +++ b/wasm/src/codepoint_type.rs @@ -0,0 +1,38 @@ +use core::fmt; + +/// This enum describes the seven basic types of codepoints. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum CodepointType { + /// Letter, mark, number, punctuation, symbol, and spaces + Graphic, + /// Invisible but affects neighboring characters; includes line/paragraph separators + Format, + /// Usage defined by protocols or standards outside the Unicode Standard + Control, + /// Usage defined by private agreement outside the Unicode Standard + PrivateUse, + /// Permanently reserved for UTF-16; restricted interchange + Surrogate, + /// Permanently reserved for internal usage; restricted interchange + Noncharacter, + /// Reserved for future assignment; restricted interchange + Reserved, +} + +impl fmt::Display for CodepointType { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match self { + CodepointType::Graphic => "Graphic", + CodepointType::Format => "Format", + CodepointType::Control => "Control", + CodepointType::PrivateUse => "Private-use", + CodepointType::Surrogate => "Surrogate", + CodepointType::Noncharacter => "Noncharacter", + CodepointType::Reserved => "Reserved", + } + ) + } +} diff --git a/wasm/src/general_category.rs b/wasm/src/general_category.rs new file mode 100644 index 00000000..fb97e099 --- /dev/null +++ b/wasm/src/general_category.rs @@ -0,0 +1,191 @@ +use crate::codepoint_type::CodepointType; +use core::str::FromStr; + +/// Values for the General_Category (gc) character property. +/// These values are fixed; no new values will be added. 
+#[derive(Debug, Copy, Clone, PartialEq, Eq, Hash)] +pub enum GeneralCategory { + /// Letter, uppercase + Lu, + /// Letter, lowercase + Ll, + /// Letter, titlecase + Lt, + /// Letter, modifier + Lm, + /// Letter, other + Lo, + /// Mark, nonspacing + Mn, + /// Mark, spacing combining + Mc, + /// Mark, enclosing + Me, + /// Number, decimal digit + Nd, + /// Number, letter + Nl, + /// Number, other + No, + /// Punctuation, connector + Pc, + /// Punctuation, dash + Pd, + /// Punctuation, open + Ps, + /// Punctuation, close + Pe, + /// Punctuation, initial quote + Pi, + /// Punctuation, final quote + Pf, + /// Punctuation, other + Po, + /// Symbol, math + Sm, + /// Symbol, currency + Sc, + /// Symbol, modifier + Sk, + /// Symbol, other + So, + /// Separator, space + Zs, + /// Separator, line + Zl, + /// Separator, paragraph + Zp, + /// Other, control + Cc, + /// Other, format + Cf, + /// Other, surrogate + Cs, + /// Other, private use + Co, + /// Other, not assigned + Cn, +} + +impl GeneralCategory { + /// This method returns the long name (e.g. "Letter, uppercase") for the given General_Category. + /// The values are stable; they will not change in future Unicode versions. 
+ pub fn long_name(self) -> &'static str { + match self { + GeneralCategory::Lu => "Letter, uppercase", + GeneralCategory::Ll => "Letter, lowercase", + GeneralCategory::Lt => "Letter, titlecase", + GeneralCategory::Lm => "Letter, modifier", + GeneralCategory::Lo => "Letter, other", + GeneralCategory::Mn => "Mark, nonspacing", + GeneralCategory::Mc => "Mark, spacing combining", + GeneralCategory::Me => "Mark, enclosing", + GeneralCategory::Nd => "Number, decimal digit", + GeneralCategory::Nl => "Number, letter", + GeneralCategory::No => "Number, other", + GeneralCategory::Pc => "Punctuation, connector", + GeneralCategory::Pd => "Punctuation, dash", + GeneralCategory::Ps => "Punctuation, open", + GeneralCategory::Pe => "Punctuation, close", + GeneralCategory::Pi => "Punctuation, initial quote", + GeneralCategory::Pf => "Punctuation, final quote", + GeneralCategory::Po => "Punctuation, other", + GeneralCategory::Sm => "Symbol, math", + GeneralCategory::Sc => "Symbol, currency", + GeneralCategory::Sk => "Symbol, modifier", + GeneralCategory::So => "Symbol, other", + GeneralCategory::Zs => "Separator, space", + GeneralCategory::Zl => "Separator, line", + GeneralCategory::Zp => "Separator, paragraph", + GeneralCategory::Cc => "Other, control", + GeneralCategory::Cf => "Other, format", + GeneralCategory::Cs => "Other, surrogate", + GeneralCategory::Co => "Other, private use", + GeneralCategory::Cn => "Other, not assigned", + } + } + + pub fn codepoint_type(self, codepoint: u32) -> CodepointType { + match self { + GeneralCategory::Cc => CodepointType::Control, + GeneralCategory::Co => CodepointType::PrivateUse, + GeneralCategory::Cs => CodepointType::Surrogate, + GeneralCategory::Cf | GeneralCategory::Zl | GeneralCategory::Zp => { + CodepointType::Format + } + GeneralCategory::Cn => match codepoint { + 0x00_fdd0..=0x00_fdef => CodepointType::Noncharacter, + 0x00_fffe..=0x00_ffff => CodepointType::Noncharacter, + 0x01_fffe..=0x01_ffff => CodepointType::Noncharacter, + 
0x02_fffe..=0x02_ffff => CodepointType::Noncharacter, + 0x03_fffe..=0x03_ffff => CodepointType::Noncharacter, + 0x04_fffe..=0x04_ffff => CodepointType::Noncharacter, + 0x05_fffe..=0x05_ffff => CodepointType::Noncharacter, + 0x06_fffe..=0x06_ffff => CodepointType::Noncharacter, + 0x07_fffe..=0x07_ffff => CodepointType::Noncharacter, + 0x08_fffe..=0x08_ffff => CodepointType::Noncharacter, + 0x09_fffe..=0x09_ffff => CodepointType::Noncharacter, + 0x0a_fffe..=0x0a_ffff => CodepointType::Noncharacter, + 0x0b_fffe..=0x0b_ffff => CodepointType::Noncharacter, + 0x0c_fffe..=0x0c_ffff => CodepointType::Noncharacter, + 0x0d_fffe..=0x0d_ffff => CodepointType::Noncharacter, + 0x0e_fffe..=0x0e_ffff => CodepointType::Noncharacter, + 0x0f_fffe..=0x0f_ffff => CodepointType::Noncharacter, + 0x10_fffe..=0x10_ffff => CodepointType::Noncharacter, + _ => CodepointType::Reserved, + }, + _ => CodepointType::Graphic, + } + } +} + +impl FromStr for GeneralCategory { + type Err = (); + + fn from_str(s: &str) -> Result<Self, Self::Err> { + match s { + "Lu" => Ok(GeneralCategory::Lu), + "Ll" => Ok(GeneralCategory::Ll), + "Lt" => Ok(GeneralCategory::Lt), + "Lm" => Ok(GeneralCategory::Lm), + "Lo" => Ok(GeneralCategory::Lo), + "Mn" => Ok(GeneralCategory::Mn), + "Mc" => Ok(GeneralCategory::Mc), + "Me" => Ok(GeneralCategory::Me), + "Nd" => Ok(GeneralCategory::Nd), + "Nl" => Ok(GeneralCategory::Nl), + "No" => Ok(GeneralCategory::No), + "Pc" => Ok(GeneralCategory::Pc), + "Pd" => Ok(GeneralCategory::Pd), + "Ps" => Ok(GeneralCategory::Ps), + "Pe" => Ok(GeneralCategory::Pe), + "Pi" => Ok(GeneralCategory::Pi), + "Pf" => Ok(GeneralCategory::Pf), + "Po" => Ok(GeneralCategory::Po), + "Sm" => Ok(GeneralCategory::Sm), + "Sc" => Ok(GeneralCategory::Sc), + "Sk" => Ok(GeneralCategory::Sk), + "So" => Ok(GeneralCategory::So), + "Zs" => Ok(GeneralCategory::Zs), + "Zl" => Ok(GeneralCategory::Zl), + "Zp" => Ok(GeneralCategory::Zp), + "Cc" => Ok(GeneralCategory::Cc), + "Cf" => Ok(GeneralCategory::Cf), + "Cs" => 
Ok(GeneralCategory::Cs), + "Co" => Ok(GeneralCategory::Co), + "Cn" => Ok(GeneralCategory::Cn), + _ => Err(()), + } + } +} + +#[cfg(test)] +mod test { + use crate::general_category::GeneralCategory; + + #[test] + fn category_name() { + assert_eq!(GeneralCategory::Lu.long_name(), "Letter, uppercase"); + assert_eq!(GeneralCategory::Sc.long_name(), "Symbol, currency"); + } +} diff --git a/wasm/src/lib.rs b/wasm/src/lib.rs index 24c4e93c..95f59af2 100644 --- a/wasm/src/lib.rs +++ b/wasm/src/lib.rs @@ -1,4 +1,6 @@ +mod codepoint_type; mod data; +mod general_category; mod utf_encodings; mod utils; @@ -332,7 +334,7 @@ pub fn decode_str(encoding_name: &str, code_units: Vec) -> Option> #[wasm_bindgen] pub fn long_category_name_for_short_name(short_name: &str) -> Option { use std::str::FromStr; - use unicode_rs::general_category::GeneralCategory; + use crate::general_category::GeneralCategory; Some( GeneralCategory::from_str(short_name) @@ -348,7 +350,7 @@ pub fn basic_type_for_codepoint( codepoint: u32, ) -> Option { use std::str::FromStr; - use unicode_rs::general_category::GeneralCategory; + use crate::general_category::GeneralCategory; Some( GeneralCategory::from_str(short_general_category_name)