Skip to content

Commit

Permalink
Pull in ahash by default behind a feature flag
Browse files Browse the repository at this point in the history
This is the same flag as in hashbrown. It improves the ergonomics of
initializing a dictionary: note the new usage of `Dictionary::new` in
the diff.
  • Loading branch information
the-mikedavis committed Aug 29, 2024
1 parent 0416e5c commit cb5e9e1
Show file tree
Hide file tree
Showing 9 changed files with 69 additions and 51 deletions.
14 changes: 10 additions & 4 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,14 @@ license = "MPL-2.0"
rust-version = "1.70"

[dependencies]
# Used for HashMap and RawTable for a custom bag structure.
hashbrown = { version = "0.14", default-features = false, features = ["raw"] }
# Used as the default BuildHasher when the `default-hasher` feature flag
# is enabled (which it is by default).
ahash = { version = "0.8", default-features = false, optional = true }

[dev-dependencies]
# Used as a BuildHasher for the hashing data types - hashbrown HashMap and a custom
# `HashMultiMap` - during unit tests.
ahash = { version = "0.8", default-features = false }
# Used in unit tests to prevent compiling the same dictionary twice. Used instead of
# Used in unit tests to lazily compile en_US. Used instead of
# `core::cell::OnceCell` since it implements `Send + Sync`.
once_cell = "1.19"
# Used in the integration tests to find and read Hunspell test case files, potentially
Expand All @@ -24,3 +25,8 @@ encoding_rs = "0.8"
chardetng = "0.1"
# Minimalist benchmarking crate.
brunch = "0.5"

[features]
default = ["default-hasher"]
# Sets a default hasher (currently ahash).
default-hasher = ["dep:ahash"]
6 changes: 5 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ Spellbook is a Rust spellchecker library compatible with the Hunspell dictionary
fn main() {
let aff = std::fs::read_to_string("en_US.aff").unwrap();
let dic = std::fs::read_to_string("en_US.dic").unwrap();
let dict = spellbook::Dictionary::new_with_hasher(&dic, &aff, ahash::RandomState::new());
let dict = spellbook::Dictionary::new(&dic, &aff);

let word = std::env::args().nth(1).expect("expected a word to check");

Expand All @@ -29,6 +29,10 @@ Currently the `check` API works well for `en_US` - a relatively simple dictionar

Spellbook should be considered to be in _alpha_. Part of the Hunspell test corpus has been ported and there are a healthy number of unit tests, but there are certainly bugs to be found.

### Feature flags

The only feature flag currently is `default-hasher` which pulls in [`ahash`](https://github.com/tkaitchuck/aHash) and is enabled by default like the equivalent flag from [`hashbrown`]. A non-cryptographic hash significantly improves the time it takes to check a word.

### How does it work?

For a more in depth overview, check out [`@zverok`]'s blog series [Rebuilding the spellchecker][zverok-blog].
Expand Down
3 changes: 1 addition & 2 deletions examples/check.rs
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@ fn main() {
};

let now = Instant::now();
let dict =
Dictionary::new_with_hasher(EN_US_DIC, EN_US_AFF, ahash::RandomState::new()).unwrap();
let dict = Dictionary::new(EN_US_DIC, EN_US_AFF).unwrap();
println!("Compiled the dictionary in {}ms", now.elapsed().as_millis());

let now = Instant::now();
Expand Down
3 changes: 1 addition & 2 deletions examples/prose.rs
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,7 @@ fn main() {
let mut misspelled = 0;

let now = Instant::now();
let dict =
Dictionary::new_with_hasher(EN_US_DIC, EN_US_AFF, ahash::RandomState::new()).unwrap();
let dict = Dictionary::new(EN_US_DIC, EN_US_AFF).unwrap();
println!("Compiled the dictionary in {}ms", now.elapsed().as_millis());

let now = Instant::now();
Expand Down
4 changes: 2 additions & 2 deletions src/aff.rs
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ use crate::{
has_flag, AffixingMode, Flag, FlagSet, WordList, AT_COMPOUND_BEGIN, AT_COMPOUND_END, FULL_WORD,
};

use core::{hash::BuildHasher, marker::PhantomData, num::NonZeroU16, str::Chars};
use core::{marker::PhantomData, num::NonZeroU16, str::Chars};

pub(crate) const HIDDEN_HOMONYM_FLAG: Flag = unsafe { Flag::new_unchecked(u16::MAX) };
pub(crate) const MAX_SUGGESTIONS: usize = 16;
Expand Down Expand Up @@ -1055,7 +1055,7 @@ impl CaseHandling {
}

#[derive(Debug)]
pub(crate) struct AffData<S: BuildHasher> {
pub(crate) struct AffData<S> {
// checking options
pub words: WordList<S>,
pub prefixes: PrefixIndex,
Expand Down
17 changes: 7 additions & 10 deletions src/checker.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2274,7 +2274,6 @@ impl<'aff> CompoundingResult<'aff> {

#[cfg(test)]
mod test {
use ahash::RandomState;
use once_cell::sync::Lazy;

use super::*;
Expand All @@ -2285,9 +2284,7 @@ mod test {

// It's a little overkill to use a real dictionary for unit tests but it compiles so
// quickly that if we only compile it once it doesn't really slow down the test suite.
static EN_US: Lazy<Dictionary<RandomState>> = Lazy::new(|| {
Dictionary::new_with_hasher(EN_US_DIC, EN_US_AFF, RandomState::new()).unwrap()
});
static EN_US: Lazy<Dictionary> = Lazy::new(|| Dictionary::new(EN_US_DIC, EN_US_AFF).unwrap());

#[test]
fn are_three_chars_equal_test() {
Expand Down Expand Up @@ -2413,7 +2410,7 @@ mod test {
bass
"#;

let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();

assert!(dict.check("aussaß"));
assert!(dict.check("Aussaß"));
Expand Down Expand Up @@ -2446,7 +2443,7 @@ mod test {
anch'Ella
"#;

let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();

assert!(dict.check("cent'anni"));
assert!(dict.check("d'Intelvi"));
Expand Down Expand Up @@ -2474,7 +2471,7 @@ mod test {
affine
affluent/Y
"#;
let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();
assert!(dict.check("affine"));
assert!(dict.check("affine"));
assert!(dict.check("affluent"));
Expand Down Expand Up @@ -2529,7 +2526,7 @@ mod test {
trazable/kSJ
"#;

let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();

// Stem
assert!(dict.check("perdurable"));
Expand Down Expand Up @@ -2579,7 +2576,7 @@ mod test {
stem/pi
"#;

let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();

assert!(dict.check("stem"));
assert!(dict.check("prestem"));
Expand Down Expand Up @@ -2617,7 +2614,7 @@ mod test {
stem2/p2s2
"#;

let dict = Dictionary::new_with_hasher(dic, aff, RandomState::new()).unwrap();
let dict = Dictionary::new(dic, aff).unwrap();
assert!(dict.aff_data.options.complex_prefixes);

assert!(dict.check("stem1"));
Expand Down
54 changes: 29 additions & 25 deletions src/hash_bag.rs
Original file line number Diff line number Diff line change
Expand Up @@ -33,18 +33,34 @@ pub struct HashBag<K, V, S> {
build_hasher: S,
}

impl<K, V, S> HashBag<K, V, S>
where
K: Hash + Eq,
S: BuildHasher,
{
pub fn with_hasher(build_hasher: S) -> Self {
impl<K, V, S: BuildHasher + Default> HashBag<K, V, S> {
pub fn new() -> Self {
Self {
table: RawTable::new(),
build_hasher,
build_hasher: S::default(),
}
}
}

impl<K, V, S> HashBag<K, V, S> {
pub fn iter(&self) -> Iter<'_, K, V> {
// Here we tie the lifetime of self to the iter.
Iter {
inner: unsafe { self.table.iter() },
marker: PhantomData,
}
}

pub fn len(&self) -> usize {
self.table.len()
}
}

impl<K, V, S> HashBag<K, V, S>
where
K: Hash + Eq,
S: BuildHasher,
{
pub fn with_capacity_and_hasher(capacity: usize, build_hasher: S) -> Self {
Self {
table: RawTable::with_capacity(capacity),
Expand All @@ -60,18 +76,6 @@ where
self.table.insert(hash, (k, v), hasher);
}

pub fn iter(&self) -> Iter<'_, K, V> {
// Here we tie the lifetime of self to the iter.
Iter {
inner: unsafe { self.table.iter() },
marker: PhantomData,
}
}

pub fn len(&self) -> usize {
self.table.len()
}

pub fn get<'map, 'key, Q>(&'map self, k: &'key Q) -> GetAllIter<'map, 'key, Q, K, V>
where
K: Borrow<Q>,
Expand All @@ -92,7 +96,6 @@ impl<K, V, S> Debug for HashBag<K, V, S>
where
K: Debug + Hash + Eq,
V: Debug,
S: BuildHasher,
{
fn fmt(&self, f: &mut core::fmt::Formatter<'_>) -> core::fmt::Result {
f.debug_map().entries(self.iter()).finish()
Expand Down Expand Up @@ -218,12 +221,13 @@ where
#[cfg(test)]
mod test {
use crate::alloc::{string::ToString, vec::Vec};
use crate::DefaultHashBuilder;

use super::*;
type HashBag<K, V> = super::HashBag<K, V, DefaultHashBuilder>;

#[test]
fn insert_and_get_duplicate_keys() {
let mut map = HashBag::with_hasher(ahash::RandomState::new());
let mut map = HashBag::new();
map.insert(1, 1);
map.insert(5, 5);
assert!(map.len() == 2);
Expand All @@ -240,7 +244,7 @@ mod test {

#[test]
fn string_keys() {
let mut map = HashBag::with_hasher(ahash::RandomState::new());
let mut map = HashBag::new();
map.insert("hello".to_string(), "bob");
map.insert("hello".to_string(), "world");
map.insert("bye".to_string(), "bob");
Expand All @@ -257,7 +261,7 @@ mod test {
fn iter() {
// The iterator is currently unused but very small and could be useful for debugging.
let pairs = &[(1, 1), (1, 2), (1, 3), (3, 1)];
let mut map = HashBag::with_capacity_and_hasher(pairs.len(), ahash::RandomState::new());
let mut map = HashBag::new();
for (k, v) in pairs {
map.insert(k, v);
}
Expand All @@ -273,7 +277,7 @@ mod test {
fn display() {
// Shameless coverage test, it brings the file to 100% :P
let pairs = &[(1, 1), (1, 1), (1, 2), (1, 3), (3, 1)];
let mut map = HashBag::with_capacity_and_hasher(
let mut map = super::HashBag::with_capacity_and_hasher(
pairs.len(),
// We use a hard-coded seed so that the display is deterministic.
ahash::RandomState::with_seeds(123, 456, 789, 1000),
Expand Down
16 changes: 13 additions & 3 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,18 @@ use checker::Checker;
use core::{cmp::Ordering, fmt, hash::BuildHasher};
use hash_bag::HashBag;

/// Default hasher for hash tables.
#[cfg(feature = "default-hasher")]
pub type DefaultHashBuilder = core::hash::BuildHasherDefault<ahash::AHasher>;

/// Dummy default hasher for hash tables.
#[cfg(not(feature = "default-hasher"))]
pub enum DefaultHashBuilder {}

// Allow passing down an Allocator too?

/// TODO
pub struct Dictionary<S: BuildHasher> {
pub struct Dictionary<S = DefaultHashBuilder> {
aff_data: AffData<S>,
}

Expand All @@ -35,9 +45,9 @@ impl<S: BuildHasher + Clone> Dictionary<S> {
}
}

impl<S: BuildHasher + Clone + Default> Dictionary<S> {
impl Dictionary<DefaultHashBuilder> {
pub fn new(dic: &str, aff: &str) -> Result<Self, ParseDictionaryError> {
Self::new_with_hasher(dic, aff, S::default())
Self::new_with_hasher(dic, aff, DefaultHashBuilder::default())
}
}

Expand Down
3 changes: 1 addition & 2 deletions tests/corpus.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
use ahash::RandomState;
use spellbook::Dictionary;
use std::{
fs::{self, File},
Expand All @@ -20,7 +19,7 @@ fn check() {
let dic = read_to_string(path).unwrap();
let aff = read_to_string(path.with_extension("aff")).unwrap();

let dict = Dictionary::new_with_hasher(&dic, &aff, RandomState::new()).unwrap();
let dict = Dictionary::new(&dic, &aff).unwrap();

for good_word in fs::read_to_string(path.with_extension("good"))
.iter()
Expand Down

0 comments on commit cb5e9e1

Please sign in to comment.