Skip to content

Commit

Permalink
first init
Browse files Browse the repository at this point in the history
  • Loading branch information
jiacai2050 committed Jun 8, 2023
0 parents commit 39dcad3
Show file tree
Hide file tree
Showing 8 changed files with 360 additions and 0 deletions.
38 changes: 38 additions & 0 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
name: CI

on:
workflow_dispatch:
push:
branches:
- main
paths-ignore:
- '**.md'
pull_request:
branches:
- main
paths-ignore:
- '**.md'

env:
RUSTFLAGS: "-C debuginfo=1"
CARGO_TERM_COLOR: always
RUST_BACKTRACE: "1"

jobs:
sqlness:
runs-on: ubuntu-latest
timeout-minutes: 30
strategy:
matrix:
rust: [stable]
steps:
- uses: actions/checkout@v3
with:
submodules: true
- run: |
rustup set auto-self-update disable
rustup toolchain install ${{ matrix.rust }} --profile minimal
- name: Run Style Check
run: |
make clippy
make fmt
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
/target
124 changes: 124 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

17 changes: 17 additions & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
[package]
name = "hash-benchmark-rs"
description = "Benchmark different hash in Rust"
version = "0.1.0"
edition = "2021"
authors = [
"CeresDB Authors <ceresdbservice@gmail.com>",
]
license = "Apache-2.0"
repository = "https://github.com/CeresDB/hash-benchmark-rs"

[dependencies]
ahash = "0.8.3"
byteorder = "1.4.3"
murmur3 = "0.4.1"
rand = "0.8.5"
seahash = "4.1.0"
9 changes: 9 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@

run:
cargo run --release

fmt:
cargo fmt -- --check

clippy:
cargo clippy --all-targets --all-features --workspace -- -D warnings
31 changes: 31 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
# Hash Benchmark

# How to run
```bash
make run
```

# Result

## Random string, key_num(10000000), key_len(100)
| Op | Default | AHash | Murmur | SeaHasher |
| --- | --- | --- | --- | --- |
| build time | 441.487 | 12.979 | 462.289 | 141.508 |
| std dev | 283.269 | 259.322 | 274.551 | 294.800 |
| collision | 0 | 0 | 0 | 0 |


## Increasing number
| Op | Default | AHash | Murmur | SeaHasher |
| --- | --- | --- | --- | --- |
| build time | 46.265 | 12.492 | 174.967 | 82.611 |
| std dev | 263.976 | 266.246 | 275.889 | 272.080 |
| collision | 0 | 0 | 0 | 0 |


# Conclusion

- [AHash](https://github.com/tkaitchuck/aHash) is the fastest, but it doesn't guarantee a fixed hash code, so it is only recommended for in-memory structures.
- The hash codes generated by all of these hashers are almost equally evenly distributed; there is no big difference between them.
- [SeaHash](https://docs.rs/seahash/latest/seahash/) generates a fixed hash code and is reasonably fast, so it is suitable for on-disk/permanent storage.
- [DefaultHasher](https://doc.rust-lang.org/std/collections/hash_map/struct.DefaultHasher.html) in std is pretty good, but the hash codes it generates may change between Rust releases.
105 changes: 105 additions & 0 deletions src/main.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
mod util;

use ahash::AHasher;
use seahash::SeaHasher;
use std::collections::hash_map::DefaultHasher;
use std::collections::HashSet;
use std::hash::Hasher;
use std::time::{Duration, Instant};
use util::MurmurHasher;

use crate::util::gen_random_string;

/// Number of keys generated for each benchmark data set.
const KEY_NUM: usize = 10_000_000;
/// Length, in characters, of each randomly generated key.
const KEY_LEN: usize = 100;
/// Number of buckets hash codes are spread over when measuring distribution.
const BUCKET_LEN: usize = 128;

/// Measures how long it takes to hash every key in `keys` with hasher `H`.
///
/// A fresh `H::default()` is created for each key, the key's bytes are fed in
/// with a single `write` call, and `finish` is invoked to produce the hash
/// code. Returns the wall-clock time spent over the whole slice.
fn test_speed<H: Hasher + Default>(keys: &[String]) -> Duration {
    let start_time = Instant::now();
    for key in keys {
        let mut hasher = H::default();
        hasher.write(std::hint::black_box(key.as_bytes()));
        // The hash code is not used for anything else; without `black_box`
        // the optimizer is free to drop part or all of the work being timed.
        std::hint::black_box(hasher.finish());
    }

    start_time.elapsed()
}

/// Counts the keys whose hash code under `H` duplicates the hash code of
/// another key in `keys`.
///
/// Every key is hashed with a fresh `H::default()`; the result is the number
/// of keys minus the number of distinct hash codes observed.
fn test_collisions<H: Hasher + Default>(keys: &[String]) -> usize {
    let distinct_codes: HashSet<u64> = keys
        .iter()
        .map(|key| {
            let mut state = H::default();
            state.write(key.as_bytes());
            state.finish()
        })
        .collect();

    keys.len() - distinct_codes.len()
}

/// Measures how evenly hasher `H` spreads `keys` over `BUCKET_LEN` buckets.
///
/// Every key is hashed with a fresh `H::default()` and counted in the bucket
/// selected by its hash code modulo `BUCKET_LEN`. The return value is the
/// population standard deviation of the per-bucket counts.
fn test_distribution<H: Hasher + Default>(keys: &[String]) -> f64 {
    let mut counts = vec![0usize; BUCKET_LEN];
    keys.iter()
        .map(|key| {
            let mut state = H::default();
            state.write(key.as_bytes());
            state.finish() as usize % BUCKET_LEN
        })
        .for_each(|slot| counts[slot] += 1);

    let bucket_total = BUCKET_LEN as f64;
    let mean = counts.iter().sum::<usize>() as f64 / bucket_total;

    // Sum of squared deviations from the mean bucket size.
    let squared_deviations = counts
        .iter()
        .map(|&count| {
            let delta = count as f64 - mean;
            delta * delta
        })
        .sum::<f64>();

    (squared_deviations / bucket_total).sqrt()
}

/// Runs the benchmark twice: once over `KEY_NUM` random strings of length
/// `KEY_LEN`, and once over the decimal strings of `0..KEY_NUM`, printing a
/// heading before each result table.
fn main() {
    let random_keys: Vec<String> = std::iter::repeat_with(|| gen_random_string(KEY_LEN))
        .take(KEY_NUM)
        .collect();
    println!(
        "## Random string, key_num({}), key_len({})",
        KEY_NUM, KEY_LEN
    );
    run(&random_keys);

    let sequential_keys: Vec<String> = (0..KEY_NUM).map(|n| n.to_string()).collect();
    println!("\n\n ## Increasing number");
    run(&sequential_keys);
}

/// Benchmarks every hasher against `keys` and prints the results to stdout
/// as a markdown table with one column per hasher (Default, AHash, Murmur,
/// SeaHasher) and one row per measurement: build time in milliseconds,
/// standard deviation of the bucket distribution, and number of collisions.
fn run(keys: &[String]) {
    // The results are currently printed as a markdown table; more output
    // formats could be added later, e.g. with
    // https://github.com/phsym/prettytable-rs/

    // Converts a microsecond count into (fractional) milliseconds.
    let as_ms = |micros: u128| -> f64 { micros as f64 / 1000_f64 };

    println!("| Op | Default | AHash | Murmur | SeaHasher |");
    println!("| --- | --- | --- | --- | --- |");
    println!(
        "| build time | {:.3} | {:.3} | {:.3} | {:.3} |",
        as_ms(test_speed::<DefaultHasher>(keys).as_micros()),
        as_ms(test_speed::<AHasher>(keys).as_micros()),
        as_ms(test_speed::<MurmurHasher>(keys).as_micros()),
        as_ms(test_speed::<SeaHasher>(keys).as_micros()),
    );
    println!(
        "| std dev | {:.3} | {:.3} | {:.3} | {:.3} |",
        test_distribution::<DefaultHasher>(keys),
        test_distribution::<AHasher>(keys),
        test_distribution::<MurmurHasher>(keys),
        test_distribution::<SeaHasher>(keys),
    );

    // Each column must be measured with its own hasher, in the same order as
    // the table header.
    println!(
        "| collision | {} | {} | {} | {} |",
        test_collisions::<DefaultHasher>(keys),
        test_collisions::<AHasher>(keys),
        test_collisions::<MurmurHasher>(keys),
        test_collisions::<SeaHasher>(keys),
    );
}
35 changes: 35 additions & 0 deletions src/util.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
use std::hash::Hasher;

use byteorder::{ByteOrder, LittleEndian};
use rand::Rng;

pub fn hash64(mut bytes: &[u8]) -> u64 {
use murmur3::murmur3_x64_128;

let mut out = [0; 16];
murmur3_x64_128(&mut bytes, 0, &mut out);
// in most cases we run on little endian target
LittleEndian::read_u64(&out[0..8])
}

/// A [`Hasher`] built on [`hash64`]. The wrapped `u64` is the running hash
/// code, which starts at zero when the hasher is created via `Default`.
#[derive(Debug, Default)]
pub struct MurmurHasher(u64);

impl Hasher for MurmurHasher {
    /// XORs the `hash64` digest of `bytes` into the running hash code.
    fn write(&mut self, bytes: &[u8]) {
        let Self(state) = self;
        *state ^= hash64(bytes);
    }

    /// Returns the running hash code accumulated by `write` so far.
    fn finish(&self) -> u64 {
        let Self(state) = *self;
        state
    }
}

/// Generates a random string of `length` characters, each drawn from the 36
/// characters `a`-`z` and `0`-`9` using the thread-local random generator.
pub fn gen_random_string(length: usize) -> String {
    let mut rng = rand::thread_rng();
    // Collect straight into the `String`; no intermediate `Vec<char>` needed.
    (0..length)
        .map(|_| {
            // 0..26 selects a lowercase letter, 26..36 selects a digit.
            let n: u8 = rng.gen_range(0..36);
            let byte = if n < 26 { b'a' + n } else { b'0' + (n - 26) };
            char::from(byte)
        })
        .collect()
}

0 comments on commit 39dcad3

Please sign in to comment.