Skip to content

Commit 23c6816

Browse files
committed
Add mash package
1 parent 0d01820 commit 23c6816

File tree

2 files changed

+101
-0
lines changed

2 files changed

+101
-0
lines changed

mash/mash.go

Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
// Package mash provides sequence MinHashing and Mash distance calculation.
2+
package mash
3+
4+
import (
5+
"bytes"
6+
"math"
7+
8+
"github.com/fluhus/biostuff/sequtil/v2"
9+
"github.com/fluhus/gostuff/minhash"
10+
"github.com/spaolacci/murmur3"
11+
)
12+
13+
// Seed is the hash seed.
14+
// Affects subsequent calls to [Sequences] and [Add].
15+
var Seed uint32 = 0
16+
17+
// Add adds the given sequences to an existing MinHash
18+
// using subsequences of length k.
19+
// Equivalent to calling [Sequences] on the old and new sequences together.
20+
func Add(mh *minhash.MinHash[uint64], k int, seqs ...[]byte) {
21+
h := murmur3.New64WithSeed(Seed)
22+
for _, seq := range seqs {
23+
for b := range sequtil.CanonicalSubsequences(bytes.ToUpper(seq), k) {
24+
h.Reset()
25+
h.Write(b)
26+
mh.Push(h.Sum64())
27+
}
28+
}
29+
mh.Sort()
30+
}
31+
32+
// Sequences returns a single MinHash for seqs with n elements and for
33+
// subsequences of length k.
34+
func Sequences(n, k int, seqs ...[]byte) *minhash.MinHash[uint64] {
35+
mh := minhash.New[uint64](n)
36+
Add(mh, k, seqs...)
37+
return mh
38+
}
39+
40+
// Distance returns the Mash distance between two MinHash collections.
41+
func Distance(mh1, mh2 *minhash.MinHash[uint64], k int) float64 {
42+
return FromJaccard(mh1.Jaccard(mh2), k)
43+
}
44+
45+
// FromJaccard returns the Mash distance given a Jaccard similarity.
46+
func FromJaccard(jac float64, k int) float64 {
47+
if jac == 0 {
48+
return 1
49+
}
50+
return min(-math.Log(2*jac/(1+jac))/float64(k), 1)
51+
}

mash/mash_test.go

Lines changed: 50 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,50 @@
1+
package mash
2+
3+
import (
4+
"math"
5+
"math/rand/v2"
6+
"slices"
7+
"testing"
8+
9+
"github.com/fluhus/biostuff/sequtil/v2"
10+
)
11+
12+
func TestSequences(t *testing.T) {
13+
seqs := [][]byte{
14+
[]byte("AGATTTTTCTCCCAACGAAACTTTACAGCACGCTAGTTTACGGGCACTCC"),
15+
[]byte("GCCCTATCTGGGGAGAAATCGTAGTGAGAGACCGAGGTGGCCCCACGCAC"),
16+
[]byte("TACGACTGGAAGAGCCCATGCACGGATGCTGCTACTCGCATTGGTTTACG"),
17+
}
18+
mh1 := Sequences(5, 21, seqs...)
19+
mh2 := Sequences(5, 21, seqs[0])
20+
Add(mh2, 21, seqs[1])
21+
Add(mh2, 21, seqs[2])
22+
23+
v1 := mh1.View()
24+
v2 := mh2.View()
25+
if !slices.Equal(v1, v2) {
26+
t.Fatalf("Sequences(...) != Add(...), %v != %v", v1, v2)
27+
}
28+
}
29+
30+
func TestDistance(t *testing.T) {
31+
const n = 1000000
32+
33+
// Create sequence.
34+
seq1 := make([]byte, n)
35+
for i := range seq1 {
36+
seq1[i] = sequtil.Iton(rand.N(4))
37+
}
38+
39+
// Create mutated sequence.
40+
seq2 := slices.Clone(seq1)
41+
for _, i := range rand.Perm(n)[:n/20] {
42+
seq2[i] = sequtil.Iton((sequtil.Ntoi(seq2[i]) + 1 + rand.N(3)) % 4)
43+
}
44+
45+
d := Distance(Sequences(10000, 21, seq1), Sequences(10000, 21, seq2), 21)
46+
dif := math.Abs(d - 0.05)
47+
if dif > 0.0001 {
48+
t.Fatalf("Distance(...)=%v, want %v", d, 0.05)
49+
}
50+
}

0 commit comments

Comments
 (0)