diff --git a/go.mod b/go.mod index 9150b5c..03e78b2 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module github.com/fluhus/biostuff -go 1.21 +go 1.22 require ( github.com/fluhus/gostuff v0.4.0 diff --git a/sequtil/v2/amino.go b/sequtil/v2/amino.go new file mode 100644 index 0000000..493408e --- /dev/null +++ b/sequtil/v2/amino.go @@ -0,0 +1,157 @@ +// Functions for amino acids. + +package sequtil + +import ( + "fmt" +) + +const ( + // AminoAcids holds the single-letter amino acid symbols. + // These are the valid inputs to AminoName. + AminoAcids = "ABCDEFGHIKLMNPQRSTVWXYZ*" +) + +// Translate translates the nucleotides in src to amino acids, appends the result to +// dst and returns the new slice. Nucleotides should be in "aAcCgGtT". Length of src +// should be a multiple of 3. +func Translate(dst, src []byte) []byte { + if len(src)%3 != 0 { + panic(fmt.Sprintf("length of src should be a multiple of 3, got %v", + len(src))) + } + var buf [3]byte + for i := 0; i < len(src); i += 3 { + copy(buf[:], src[i:i+3]) + for j := range buf { + if buf[j] >= 'a' { + buf[j] -= 'a' - 'A' + } + } + aa := codonToAmino[buf] + if aa == 0 { + panic(fmt.Sprintf("bad codon at position %v: %q", i, src[i:i+3])) + } + dst = append(dst, aa) + } + return dst +} + +// TranslateReadingFrames returns the translation of the 3 reading frames of seq. +// Nucleotides should be in "aAcCgGtT". seq can be of any length. +func TranslateReadingFrames(seq []byte) [3][]byte { + var result [3][]byte + for i := 0; i < 3; i++ { + sub := seq[i:] + sub = sub[:len(sub)/3*3] + result[i] = Translate(nil, sub) + } + return result +} + +// AminoName returns the 3-letter code and the full name of the amino acid with the +// given letter. Input may be uppercase or lowercase. +func AminoName(aa byte) (string, string) { + if aa >= 'a' && aa <= 'z' { + aa -= 'a' - 'A' + } + names, ok := aminoToName[aa] + if !ok { + panic(fmt.Sprintf("bad amino acid code: '%c'", aa)) + } + return names[0], names[1] +} + +var codonToAmino = map[[3]byte]byte{ + {'A', 'A', 'A'}: 'K', + {'A', 'A', 'C'}: 'N', + {'A', 'A', 'G'}: 'K', + {'A', 'A', 'T'}: 'N', + {'A', 'C', 'A'}: 'T', + {'A', 'C', 'C'}: 'T', + {'A', 'C', 'G'}: 'T', + {'A', 'C', 'T'}: 'T', + {'A', 'G', 'A'}: 'R', + {'A', 'G', 'C'}: 'S', + {'A', 'G', 'G'}: 'R', + {'A', 'G', 'T'}: 'S', + {'A', 'T', 'A'}: 'I', + {'A', 'T', 'C'}: 'I', + {'A', 'T', 'G'}: 'M', + {'A', 'T', 'T'}: 'I', + {'C', 'A', 'A'}: 'Q', + {'C', 'A', 'C'}: 'H', + {'C', 'A', 'G'}: 'Q', + {'C', 'A', 'T'}: 'H', + {'C', 'C', 'A'}: 'P', + {'C', 'C', 'C'}: 'P', + {'C', 'C', 'G'}: 'P', + {'C', 'C', 'T'}: 'P', + {'C', 'G', 'A'}: 'R', + {'C', 'G', 'C'}: 'R', + {'C', 'G', 'G'}: 'R', + {'C', 'G', 'T'}: 'R', + {'C', 'T', 'A'}: 'L', + {'C', 'T', 'C'}: 'L', + {'C', 'T', 'G'}: 'L', + {'C', 'T', 'T'}: 'L', + {'G', 'A', 'A'}: 'E', + {'G', 'A', 'C'}: 'D', + {'G', 'A', 'G'}: 'E', + {'G', 'A', 'T'}: 'D', + {'G', 'C', 'A'}: 'A', + {'G', 'C', 'C'}: 'A', + {'G', 'C', 'G'}: 'A', + {'G', 'C', 'T'}: 'A', + {'G', 'G', 'A'}: 'G', + {'G', 'G', 'C'}: 'G', + {'G', 'G', 'G'}: 'G', + {'G', 'G', 'T'}: 'G', + {'G', 'T', 'A'}: 'V', + {'G', 'T', 'C'}: 'V', + {'G', 'T', 'G'}: 'V', + {'G', 'T', 'T'}: 'V', + {'T', 'A', 'A'}: '*', + {'T', 'A', 'C'}: 'Y', + {'T', 'A', 'G'}: '*', + {'T', 'A', 'T'}: 'Y', + {'T', 'C', 'A'}: 'S', + {'T', 'C', 'C'}: 'S', + {'T', 'C', 'G'}: 'S', + {'T', 'C', 'T'}: 'S', + {'T', 'G', 'A'}: '*', + {'T', 'G', 'C'}: 'C', + {'T', 'G', 'G'}: 'W', + {'T', 'G', 'T'}: 'C', + {'T', 'T', 'A'}: 'L', + {'T', 'T', 'C'}: 'F', + {'T', 'T', 'G'}: 'L', + {'T', 'T', 'T'}: 'F', +} + +var aminoToName = map[byte][2]string{ + 'A': {"Ala", "Alanine"}, + 'B': {"Asx", "Asparagine"}, + 'C': {"Cys", "Cysteine"}, + 'D': {"Asp", "Aspartic"}, + 'E': {"Glu", "Glutamic"}, + 'F': {"Phe", "Phenylalanine"}, + 'G': {"Gly", "Glycine"}, + 'H': {"His", "Histidine"}, + 'I': {"Ile", "Isoleucine"}, + 'K': {"Lys", "Lysine"}, + 'L': {"Leu", "Leucine"}, + 'M': {"Met", "Methionine"}, + 'N': {"Asn", "Asparagine"}, + 'P': {"Pro", "Proline"}, + 'Q': {"Gln", "Glutamine"}, + 'R': {"Arg", "Arginine"}, + 'S': {"Ser", "Serine"}, + 'T': {"Thr", "Threonine"}, + 'V': {"Val", "Valine"}, + 'W': {"Trp", "Tryptophan"}, + 'X': {"X", "Any codon"}, + 'Y': {"Tyr", "Tyrosine"}, + 'Z': {"Glx", "Glutamine"}, + '*': {"*", "Stop codon"}, +} diff --git a/sequtil/v2/animo_test.go b/sequtil/v2/animo_test.go new file mode 100644 index 0000000..057efe4 --- /dev/null +++ b/sequtil/v2/animo_test.go @@ -0,0 +1,65 @@ +package sequtil + +import ( + "reflect" + "testing" +) + +func TestTranslate(t *testing.T) { + input := "AGAcatTGGgat" + want := "RHWD" + got := Translate(nil, []byte(input)) + if string(got) != want { + t.Fatalf("Translate(%q)=%q, want %q", input, got, want) + } +} + +func TestTranslate_badBase(t *testing.T) { + defer func() { recover() }() + input := "AGAcatTGGgad" + Translate(nil, []byte(input)) + t.Fatalf("Translate(%q) succeeded, want panic", input) +} + +func TestTranslate_badLength(t *testing.T) { + defer func() { recover() }() + input := "AGAcatTGGgatt" + Translate(nil, []byte(input)) + t.Fatalf("Translate(%q) succeeded, want panic", input) +} + +func TestTranslateReadingFrames(t *testing.T) { + input := "AGAcatTGGgat" + want := [3][]byte{[]byte("RHWD"), []byte("DIG"), []byte("TLG")} + if got := TranslateReadingFrames([]byte(input)); !reflect.DeepEqual(got, want) { + t.Fatalf("TranslateReadingFrames(%q)=%v, want %v", input, got, want) + } +} + +func TestAminoName(t *testing.T) { + tests := []struct { + input byte + want1 string + want2 string + }{ + {'H', "His", "Histidine"}, + {'Q', "Gln", "Glutamine"}, + {'h', "His", "Histidine"}, + {'q', "Gln", "Glutamine"}, + {'*', "*", "Stop codon"}, + } + + for _, test := range tests { + got1, got2 := AminoName(test.input) + if got1 != test.want1 || got2 != test.want2 { + t.Fatalf("AminoName('%c')=(%q,%q), want (%q,%q)", + test.input, got1, got2, test.want1, test.want2) + } + } +} + +func TestAminoName_bad(t *testing.T) { + defer func() { recover() }() + AminoName('U') + t.Fatalf("AminoName('U') succeeded, want panic") +} diff --git a/sequtil/v2/sequtil.go b/sequtil/v2/sequtil.go new file mode 100644 index 0000000..b356b08 --- /dev/null +++ b/sequtil/v2/sequtil.go @@ -0,0 +1,210 @@ +// Package sequtil provides functions for processing genetic sequences. +// +// # Data Type For Sequences +// +// In this repository, sequences are represented as byte slices. This is meant +// to keep them familiar and predictable. This design favors having a buffet of +// functions for manipulating basic types, over having a dedicated sequence type with +// methods. +// +// # Mutating Sequences +// +// Mutations can be made using basic slice operations. +// +// // Substitution +// seq[4] = 'G' +// +// // Deletion of bases 10-12 (including 12) +// copy(seq[10:], seq[13:]) +// seq = seq[:len(seq)-3] +// +// // Insertion at position 4 +// insert := []byte{...} +// seq = append(append(append([]byte{}, seq[:4]...), insert...), seq[4:]...) +// +// # The U Nucleotide +// +// This package currently ignores the existence of uracil. Adding support for uracil +// means increasing the complexity of the API without adding new capabilities. The +// current solution is to substitute U's with T's before calling this package. +// +// This may change in the future, keeping backward compatibility. +package sequtil + +import ( + "bytes" + "fmt" + "strings" +) + +// Maps nucleotide byte value to its int value. +var ntoi []int + +func init() { + // Initialize ntoi values. + ntoi = make([]int, 256) + for i := range ntoi { + ntoi[i] = -1 + } + ntoi['a'], ntoi['A'] = 0, 0 + ntoi['c'], ntoi['C'] = 1, 1 + ntoi['g'], ntoi['G'] = 2, 2 + ntoi['t'], ntoi['T'] = 3, 3 +} + +// Ntoi converts a nucleotide to an int. +// Aa:0 Cc:1 Gg:2 Tt:3. Other values return -1. +func Ntoi(nuc byte) int { + return ntoi[nuc] +} + +// Iton converts an int to a nucleotide character. +// 0:A 1:C 2:G 3:T. Other values return N. +func Iton(num int) byte { + switch num { + case 0: + return 'A' + case 1: + return 'C' + case 2: + return 'G' + case 3: + return 'T' + default: + return 'N' + } +} + +// ReverseComplement appends to dst the reverse complement of src and returns +// the new dst. Characters not in "aAcCgGtTnN" will cause a panic. +func ReverseComplement(dst, src []byte) []byte { + for i := len(src) - 1; i >= 0; i-- { + b := src[i] + switch b { + case 'a': + dst = append(dst, 't') + case 'c': + dst = append(dst, 'g') + case 'g': + dst = append(dst, 'c') + case 't': + dst = append(dst, 'a') + case 'A': + dst = append(dst, 'T') + case 'C': + dst = append(dst, 'G') + case 'G': + dst = append(dst, 'C') + case 'T': + dst = append(dst, 'A') + case 'N': + dst = append(dst, 'N') + case 'n': + dst = append(dst, 'n') + default: + panic(fmt.Sprintf("Unexpected base value: %q, want aAcCgGtTnN", b)) + } + } + return dst +} + +// ReverseComplementString returns the reverse complement of s. +// Characters not in "aAcCgGtTnN" will cause a panic. +func ReverseComplementString(s string) string { + builder := &strings.Builder{} + builder.Grow(len(s)) + for i := len(s) - 1; i >= 0; i-- { + b := s[i] + switch b { + case 'a': + builder.WriteByte('t') + case 'c': + builder.WriteByte('g') + case 'g': + builder.WriteByte('c') + case 't': + builder.WriteByte('a') + case 'A': + builder.WriteByte('T') + case 'C': + builder.WriteByte('G') + case 'G': + builder.WriteByte('C') + case 'T': + builder.WriteByte('A') + case 'N': + builder.WriteByte('N') + case 'n': + builder.WriteByte('n') + default: + panic(fmt.Sprintf("Unexpected base value: %q, want aAcCgGtTnN", b)) + } + } + return builder.String() +} + +// DNATo2Bit appends to dst the 2-bit representation of the DNA sequence in src, +// and returns the new dst. Characters not in "aAcCgGtT" will cause a panic. +func DNATo2Bit(dst, src []byte) []byte { + dn := len(dst) + for i, b := range src { + di := dn + i/4 + shift := 6 - i%4*2 // Make the first character the most significant. + if shift == 6 { + // Starting a new byte. + dst = append(dst, 0) + } + dbInt := Ntoi(b) + if dbInt == -1 { + panic(fmt.Sprintf("Unexpected base value: %q, want aAcCgGtT", b)) + } + db := byte(dbInt) << shift + dst[di] |= db + } + return dst +} + +// DNAFrom2Bit appends to dst the nucleotides represented in 2-bit in src and +// returns the new dst. Only outputs characters in "ACGT". +func DNAFrom2Bit(dst, src []byte) []byte { + for i := 0; i < len(src); i++ { + dst = append(dst, dnaFrom2bit[src[i]][:]...) + } + return dst +} + +// Maps 2-bit value to its expanded representation. +var dnaFrom2bit = make([][4]byte, 256) + +// Initializes the dnaFrom2bit slice. +func init() { + val := make([]byte, 4) + for i := 0; i < 256; i++ { + for j := 0; j < 4; j++ { + // First base is the most significant digit. + val[3-j] = Iton((i >> (2 * j)) & 3) + } + copy(dnaFrom2bit[i][:], val) + } +} + +// CanonicalSubsequences iterates over canonical k-long subsequences of seq. +// A canonical sequence is the lexicographically lesser out of a sequence and +// its reverse complement. +// Makes one call to ReverseComplement. +func CanonicalSubsequences(seq []byte, k int) func(yield func([]byte) bool) { + return func(yield func([]byte) bool) { + rc := ReverseComplement(make([]byte, 0, len(seq)), seq) + nk := len(seq) - k + 1 + for i := 0; i < nk; i++ { + kmer := seq[i : i+k] + kmerRC := rc[len(rc)-i-k : len(rc)-i] + if bytes.Compare(kmer, kmerRC) == 1 { + kmer = kmerRC + } + if !yield(kmer) { + return + } + } + } +} diff --git a/sequtil/v2/sequtil_test.go b/sequtil/v2/sequtil_test.go new file mode 100644 index 0000000..756ab6d --- /dev/null +++ b/sequtil/v2/sequtil_test.go @@ -0,0 +1,130 @@ +package sequtil + +import ( + "reflect" + "testing" + + "golang.org/x/exp/slices" +) + +func TestReverseComplement(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"A", "T"}, + {"AAA", "TTT"}, + {"aaa", "ttt"}, + {"AACTTGGG", "CCCAAGTT"}, + {"TGTGTG", "CACACA"}, + {"", ""}, + } + var got []byte + for _, test := range tests { + got = ReverseComplement(got[:0], []byte(test.input)) + if string(got) != test.want { + t.Errorf("ReverseComplement(%q)=%q, want %q", + test.input, got, test.want) + } + } +} + +func TestReverseComplementString(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"A", "T"}, + {"AAA", "TTT"}, + {"aaa", "ttt"}, + {"AACTTGGG", "CCCAAGTT"}, + {"TGTGTG", "CACACA"}, + {"", ""}, + } + for _, test := range tests { + got := ReverseComplementString(test.input) + if got != test.want { + t.Errorf("ReverseComplementString(%q)=%v, want %v", + test.input, got, test.want) + } + } +} + +func TestDNATo2Bit(t *testing.T) { + tests := []struct { + input []byte + want []byte + }{ + {[]byte("acgtTGCA"), []byte{0b00011011, 0b11100100}}, + {[]byte("tatat"), []byte{0b11001100, 0b11000000}}, + {[]byte("ccc"), []byte{0b01010100}}, + } + var got []byte + for _, test := range tests { + got = DNATo2Bit(got[:0], test.input) + if !reflect.DeepEqual(got, test.want) { + t.Errorf("DNATo2Bit(%q)=%v, want %v", test.input, got, test.want) + } + } +} + +func TestDNAFrom2Bit(t *testing.T) { + tests := []struct { + input []byte + want []byte + }{ + {[]byte{0b00011011, 0b11100100}, []byte("ACGTTGCA")}, + {[]byte{0b11001100, 0b11000000}, []byte("TATATAAA")}, + {[]byte{0b01010100}, []byte("CCCA")}, + } + var got []byte + for _, test := range tests { + got = DNAFrom2Bit(got[:0], test.input) + if !reflect.DeepEqual(got, test.want) { + t.Errorf("DNAFrom2Bit(%v)=%q, want %q", test.input, got, test.want) + } + } +} + +func BenchmarkDNATo2Bit(b *testing.B) { + dna := []byte("acgtacgtacgtacgtacgtacgtacgtacgt") + var twobit []byte + b.ResetTimer() + for i := 0; i < b.N; i++ { + twobit = DNATo2Bit(twobit[:0], dna) + } +} + +func BenchmarkDNAFrom2Bit(b *testing.B) { + dna := []byte("acgtacgtacgtacgtacgtacgtacgtacgt") + twobit := DNATo2Bit(nil, dna) + b.ResetTimer() + for i := 0; i < b.N; i++ { + dna = DNAFrom2Bit(dna[:0], twobit) + } +} + +func FuzzDNA2Bit(f *testing.F) { + f.Add([]byte{}) + f.Fuzz(func(t *testing.T, a []byte) { + aa := slices.Clone(a) + b := DNAFrom2Bit(nil, a) + c := DNATo2Bit(nil, b) + if !slices.Equal(aa, c) { + t.Errorf("DNATo2Bit(DNAFrom2Bit(...)) before=%q after=%q", + aa, c) + } + }) +} + +func TestCanonical(t *testing.T) { + input := "GATTACCA" + want := []string{"GA", "AT", "AA", "TA", "AC", "CC", "CA"} + var got []string + for seq := range CanonicalSubsequences([]byte(input), 2) { + got = append(got, string(seq)) + } + if !slices.Equal(got, want) { + t.Fatalf("CanonicalSubsequences(%q)=%v, want %v", input, got, want) + } +}