|
| 1 | +// Package mash provides sequence MinHashing and Mash distance calculation. |
| 2 | +package mash |
| 3 | + |
| 4 | +import ( |
| 5 | + "bytes" |
| 6 | + "math" |
| 7 | + |
| 8 | + "github.com/fluhus/biostuff/sequtil/v2" |
| 9 | + "github.com/fluhus/gostuff/minhash" |
| 10 | + "github.com/spaolacci/murmur3" |
| 11 | +) |
| 12 | + |
// Seed is the seed given to the hash function that hashes subsequences.
// It is read on every call to [Add] (and therefore [Sequences]),
// so changing it only affects calls made afterwards.
var Seed uint32
| 16 | + |
| 17 | +// Add adds the given sequences to an existing MinHash |
| 18 | +// using subsequences of length k. |
| 19 | +// Equivalent to calling [Sequences] on the old and new sequences together. |
| 20 | +func Add(mh *minhash.MinHash[uint64], k int, seqs ...[]byte) { |
| 21 | + h := murmur3.New64WithSeed(Seed) |
| 22 | + for _, seq := range seqs { |
| 23 | + for b := range sequtil.CanonicalSubsequences(bytes.ToUpper(seq), k) { |
| 24 | + h.Reset() |
| 25 | + h.Write(b) |
| 26 | + mh.Push(h.Sum64()) |
| 27 | + } |
| 28 | + } |
| 29 | + mh.Sort() |
| 30 | +} |
| 31 | + |
| 32 | +// Sequences returns a single MinHash for seqs with n elements and for |
| 33 | +// subsequences of length k. |
| 34 | +func Sequences(n, k int, seqs ...[]byte) *minhash.MinHash[uint64] { |
| 35 | + mh := minhash.New[uint64](n) |
| 36 | + Add(mh, k, seqs...) |
| 37 | + return mh |
| 38 | +} |
| 39 | + |
| 40 | +// Distance returns the Mash distance between two MinHash collections. |
| 41 | +func Distance(mh1, mh2 *minhash.MinHash[uint64], k int) float64 { |
| 42 | + return FromJaccard(mh1.Jaccard(mh2), k) |
| 43 | +} |
| 44 | + |
// FromJaccard returns the Mash distance given a Jaccard similarity jac and
// a subsequence length k.
//
// The distance is -ln(2*jac/(1+jac))/k, capped at 1.
// For a positive k the result is always in [0, 1]:
//   - jac <= 0 (no similarity, or an invalid negative value) gives 1.
//   - jac >= 1 (identical, or an invalid value above 1) gives exactly 0.
//
// k is expected to be positive. A NaN jac results in NaN.
func FromJaccard(jac float64, k int) float64 {
	switch {
	case jac <= 0:
		// The logarithm is undefined here; treat as maximal distance.
		return 1
	case jac >= 1:
		// Returned explicitly because -math.Log(1) is negative zero, and
		// values above 1 would produce a negative distance.
		return 0
	}
	return min(-math.Log(2*jac/(1+jac))/float64(k), 1)
}
0 commit comments