From e3ca50c88ddd6cea5b1e2b47378d6e130ed5e94b Mon Sep 17 00:00:00 2001 From: Amit Lavon Date: Fri, 11 Oct 2024 14:55:15 -0700 Subject: [PATCH] Formats v2 initial commit --- formats/bed/bed.go | 4 +- formats/bed/v2/bed.go | 257 ++++++++++++++++++++++++++ formats/bed/v2/bed_test.go | 110 ++++++++++++ formats/bed/v2/iter.go | 40 +++++ formats/fasta/fasta.go | 2 + formats/fasta/v2/fasta.go | 140 +++++++++++++++ formats/fasta/v2/fasta_test.go | 106 +++++++++++ formats/fasta/v2/iter.go | 54 ++++++ formats/fastq/fastq.go | 2 + formats/fastq/v2/fastq.go | 109 +++++++++++ formats/fastq/v2/fastq_test.go | 65 +++++++ formats/fastq/v2/iter.go | 54 ++++++ formats/newick/newick.go | 2 + formats/newick/v2/newick.go | 279 +++++++++++++++++++++++++++++ formats/newick/v2/newick_test.go | 199 ++++++++++++++++++++ formats/newick/v2/traverse.go | 57 ++++++ formats/newick/v2/traverse_test.go | 55 ++++++ formats/sam/sam.go | 2 + formats/sam/v2/flag.go | 180 +++++++++++++++++++ formats/sam/v2/iter.go | 54 ++++++ formats/sam/v2/sam.go | 278 ++++++++++++++++++++++++++++ formats/sam/v2/sam_test.go | 154 ++++++++++++++++ 22 files changed, 2202 insertions(+), 1 deletion(-) create mode 100644 formats/bed/v2/bed.go create mode 100644 formats/bed/v2/bed_test.go create mode 100644 formats/bed/v2/iter.go create mode 100644 formats/fasta/v2/fasta.go create mode 100644 formats/fasta/v2/fasta_test.go create mode 100644 formats/fasta/v2/iter.go create mode 100644 formats/fastq/v2/fastq.go create mode 100644 formats/fastq/v2/fastq_test.go create mode 100644 formats/fastq/v2/iter.go create mode 100644 formats/newick/v2/newick.go create mode 100644 formats/newick/v2/newick_test.go create mode 100644 formats/newick/v2/traverse.go create mode 100644 formats/newick/v2/traverse_test.go create mode 100644 formats/sam/v2/flag.go create mode 100644 formats/sam/v2/iter.go create mode 100644 formats/sam/v2/sam.go create mode 100644 formats/sam/v2/sam_test.go diff --git a/formats/bed/bed.go b/formats/bed/bed.go index c131786..d37c370 100644 --- a/formats/bed/bed.go +++ b/formats/bed/bed.go @@ -3,11 +3,13 @@ // This package uses the format described in: // https://en.wikipedia.org/wiki/BED_(file_format) // -// Limitations +// # Limitations // // Currently only tab delimiters are supported. // // Currently BED headers are not supported. +// +// Deprecated: use v2. package bed import ( diff --git a/formats/bed/v2/bed.go b/formats/bed/v2/bed.go new file mode 100644 index 0000000..3649358 --- /dev/null +++ b/formats/bed/v2/bed.go @@ -0,0 +1,257 @@ +// Package bed decodes and encodes BED files. +// +// This package uses the format described in: +// https://en.wikipedia.org/wiki/BED_(file_format) +// +// # Limitations +// +// Currently only tab delimiters are supported. +// +// Currently BED headers are not supported. +package bed + +import ( + "bytes" + "encoding/csv" + "fmt" + "io" + "strconv" + "strings" +) + +// Valid values for the strand field. +const ( + PlusStrand = "+" + MinusStrand = "-" + NoStrand = "." +) + +// BED is a single line in a BED file. +type BED struct { + N int // Number of fields in this entry + Chrom string + ChromStart int // 0-based + ChromEnd int // 0-based exclusive + Name string + Score int + Strand string + ThickStart int + ThickEnd int + ItemRGB [3]byte + BlockCount int + BlockSizes []int // Length should match BlockCount + BlockStarts []int // Length should match BlockCount +} + +// Write writer the textual BED format representation of b to w. +// Encodes the first b.N fields, where b.N is between 3 and 12. +// Includes a trailing new line. +func (b *BED) Write(w io.Writer) error { + if b.N < 3 || b.N > 12 { + return fmt.Errorf("bad number of fields: %v, want 3-12", b.N) + } + if _, err := fmt.Fprintf(w, "%v\t%v\t%v", + b.Chrom, b.ChromStart, b.ChromEnd); err != nil { + return err + } + if b.N > 3 { + if _, err := fmt.Fprintf(w, "\t%v", b.Name); err != nil { + return err + } + } + if b.N > 4 { + if _, err := fmt.Fprintf(w, "\t%v", b.Score); err != nil { + return err + } + } + if b.N > 5 { + if _, err := fmt.Fprintf(w, "\t%v", b.Strand); err != nil { + return err + } + } + if b.N > 6 { + if _, err := fmt.Fprintf(w, "\t%v", b.ThickStart); err != nil { + return err + } + } + if b.N > 7 { + if _, err := fmt.Fprintf(w, "\t%v", b.ThickEnd); err != nil { + return err + } + } + if b.N > 8 { + if _, err := fmt.Fprintf(w, "\t%v,%v,%v", + b.ItemRGB[0], b.ItemRGB[1], b.ItemRGB[2]); err != nil { + return err + } + } + if b.N > 9 { + if _, err := fmt.Fprintf(w, "\t%v", b.BlockCount); err != nil { + return err + } + } + if b.N > 10 { + if _, err := fmt.Fprintf(w, "\t"); err != nil { + return err + } + for i, x := range b.BlockSizes { + txt := "%v" + if i > 0 { + txt = ",%v" + } + if _, err := fmt.Fprintf(w, txt, x); err != nil { + return err + } + } + } + if b.N > 11 { + if _, err := fmt.Fprintf(w, "\t"); err != nil { + return err + } + for i, x := range b.BlockStarts { + txt := "%v" + if i > 0 { + txt = ",%v" + } + if _, err := fmt.Fprintf(w, txt, x); err != nil { + return err + } + } + } + if _, err := fmt.Fprintf(w, "\n"); err != nil { + return err + } + return nil +} + +// MarshalText returns the textual representation of b in BED format. +// Encodes the first b.N fields, where b.N is between 3 and 12. +// Includes a trailing new line. +func (b *BED) MarshalText() ([]byte, error) { + buf := bytes.NewBuffer(nil) + if err := b.Write(buf); err != nil { + return nil, err + } + return buf.Bytes(), nil +} + +// Parses textual fields into a struct. Returns the number of parsed fields. +func parseLine(fields []string) (*BED, error) { + n := len(fields) + if n < 3 || n > 12 { + return nil, fmt.Errorf("bad number of fields: %v, want 3-12", n) + } + + // Force 12 fields to make parsing easy. + fields = append(fields, make([]string, 12-n)...) + bed := &BED{N: n} + var err error + + // Mandatory fields. + bed.Chrom = fields[0] + if bed.ChromStart, err = strconv.Atoi(fields[1]); err != nil { + return nil, fmt.Errorf("field 2: %v", err) + } + if bed.ChromEnd, err = strconv.Atoi(fields[2]); err != nil { + return nil, fmt.Errorf("field 3: %v", err) + } + + // Optional fields. + bed.Name = fields[3] + if fields[4] != "" { + if bed.Score, err = strconv.Atoi(fields[4]); err != nil { + return nil, fmt.Errorf("field 5: %v", err) + } + } + if fields[5] != "" && fields[5] != PlusStrand && + fields[5] != MinusStrand && fields[5] != NoStrand { + return nil, fmt.Errorf("field 6: bad strand: %q", fields[5]) + } + bed.Strand = fields[5] + if fields[6] != "" { + if bed.ThickStart, err = strconv.Atoi(fields[6]); err != nil { + return nil, fmt.Errorf("field 7: %v", err) + } + } + if fields[7] != "" { + if bed.ThickEnd, err = strconv.Atoi(fields[7]); err != nil { + return nil, fmt.Errorf("field 8: %v", err) + } + } + if fields[8] != "" { + rgb := strings.Split(fields[8], ",") + if len(rgb) != 3 { + return nil, fmt.Errorf("field 9: bad RGB value: %q", fields[8]) + } + for i := range rgb { + a, err := strconv.ParseUint(rgb[i], 0, 8) + if err != nil { + return nil, fmt.Errorf("field 9: bad RGB value: %q", fields[8]) + } + bed.ItemRGB[i] = byte(a) + } + } + if fields[9] != "" { + if bed.BlockCount, err = strconv.Atoi(fields[9]); err != nil { + return nil, fmt.Errorf("field 10: %v", err) + } + } + if fields[10] != "" { + sizes := strings.Split(fields[10], ",") + bed.BlockSizes = make([]int, len(sizes)) + for i := range sizes { + bed.BlockSizes[i], err = strconv.Atoi(sizes[i]) + if err != nil { + return nil, fmt.Errorf("field 11: %v", err) + } + } + } + if fields[11] != "" { + starts := strings.Split(fields[11], ",") + bed.BlockStarts = make([]int, len(starts)) + for i := range starts { + bed.BlockStarts[i], err = strconv.Atoi(starts[i]) + if err != nil { + return nil, fmt.Errorf("field 12: %v", err) + } + } + } + + if len(bed.BlockSizes) != bed.BlockCount { + return nil, fmt.Errorf("blockSizes has %v values but blockCount is %v", + len(bed.BlockSizes), bed.BlockCount) + } + if len(bed.BlockStarts) != bed.BlockCount { + return nil, fmt.Errorf("blockStarts has %v values but blockCount is %v", + len(bed.BlockStarts), bed.BlockCount) + } + + return bed, nil +} + +// A reader reads and parses BED lines. +type reader struct { + r *csv.Reader +} + +// newReader returns a new BED reader that reads from r. +func newReader(r io.Reader) *reader { + cr := csv.NewReader(r) + cr.Comma = '\t' + cr.Comment = '#' + return &reader{cr} +} + +// read returns the next BED line, and n as the number of fields that were found. +// The first n fields will be populated in the result BED, the rest will have zero +// values. n is always between 3 and 12. +// +// For example if n=5, then the populated fields are Chrom, ChromStart, ChromEnd, +// Name and Score. +func (r *reader) read() (b *BED, err error) { + line, err := r.r.Read() + if err != nil { + return nil, err + } + return parseLine(line) +} diff --git a/formats/bed/v2/bed_test.go b/formats/bed/v2/bed_test.go new file mode 100644 index 0000000..939be12 --- /dev/null +++ b/formats/bed/v2/bed_test.go @@ -0,0 +1,110 @@ +package bed + +import ( + "reflect" + "slices" + "strings" + "testing" +) + +func TestParseLine(t *testing.T) { + input := []string{"chr1", "10", "20", "Hello", "150", "+", "11", "13", + "50,100,150", "2", "40,60", "100,200"} + want := &BED{12, "chr1", 10, 20, "Hello", 150, "+", 11, 13, [3]byte{50, 100, 150}, + 2, []int{40, 60}, []int{100, 200}} + + // Full line. + got, err := parseLine(input) + if err != nil { + t.Fatalf("parseLine(%v) failed: %v", input, err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("parseLine(%v)=%v want %v", input, got, want) + } + + // Partial line. + input = input[:6] + want = &BED{N: 6, Chrom: "chr1", ChromStart: 10, ChromEnd: 20, Name: "Hello", + Score: 150, Strand: "+"} + got, err = parseLine(input) + if err != nil { + t.Fatalf("parseLine(%v) failed: %v", input, err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("parseLine(%v)=%v want %v", input, got, want) + } +} + +func TestParseLine_bad(t *testing.T) { + input := []string{"chr1", "10", "20", "Hello", "150", "+", "11", "13", + "50,100,150", "2", "40,60", "100,200"} + cp := slices.Clone(input) + + // Check good input. + if _, err := parseLine(cp); err != nil { + t.Fatalf("parseLine(%v) failed: %v", cp, err) + } + + // Make bad modifications. + if got, err := parseLine(cp[:2]); err == nil { + t.Fatalf("parseLine(%v)=%v want error", cp[:2], got) + } + cp[5] = "t" // Bad strand + if got, err := parseLine(cp); err == nil { + t.Fatalf("parseLine(%v)=%v want error", cp, got) + } + cp = slices.Clone(input) + cp[8] = "100" // Bad colors + if got, err := parseLine(cp); err == nil { + t.Fatalf("parseLine(%v)=%v want error", cp, got) + } + cp = slices.Clone(input) + cp[8] += "0" // Bad colors (overflow) + if got, err := parseLine(cp); err == nil { + t.Fatalf("parseLine(%v)=%v want error", cp, got) + } + cp = slices.Clone(input) + cp[10] += ",200" // Bad block starts + if got, err := parseLine(cp); err == nil { + t.Fatalf("parseLine(%v)=%v want error", cp, got) + } +} + +func TestReader(t *testing.T) { + input := "chr1\t10\t20\tHello\t150\t+\t11\t13\t50,100,150\t2\t40,60\t100,200\n" + want := []*BED{{12, "chr1", 10, 20, "Hello", 150, "+", 11, 13, [3]byte{50, 100, 150}, + 2, []int{40, 60}, []int{100, 200}}} + var got []*BED + + for bed, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + got = append(got, bed) + } + if !reflect.DeepEqual(got, want) { + t.Errorf("Next()=%v want %v", got, want) + } +} + +func TestMarshalText(t *testing.T) { + want := "chr1\t10\t20\tHello\t150\t+\t11\t13\t50,100,150\t2\t40,60\t100,200\n" + input := &BED{12, "chr1", 10, 20, "Hello", 150, "+", 11, 13, [3]byte{50, 100, 150}, + 2, []int{40, 60}, []int{100, 200}} + got, err := input.MarshalText() + if err != nil { + t.Fatalf("%v.MarshalText() failed: %v", input, err) + } + if string(got) != want { + t.Fatalf("%v.MarshalText()=%q, want %q", input, got, want) + } + input.N = 6 + want = want[:22] + "\n" + got, err = input.MarshalText() + if err != nil { + t.Fatalf("%v.MarshalText() failed: %v", input, err) + } + if string(got) != want { + t.Fatalf("%v.MarshalText()=%q, want %q", input, got, want) + } +} diff --git a/formats/bed/v2/iter.go b/formats/bed/v2/iter.go new file mode 100644 index 0000000..3087873 --- /dev/null +++ b/formats/bed/v2/iter.go @@ -0,0 +1,40 @@ +package bed + +import ( + "io" + "iter" + + "github.com/fluhus/gostuff/aio" +) + +// Reader returns an iterator over fasta entries in a reader. +func Reader(r io.Reader) iter.Seq2[*BED, error] { + return func(yield func(*BED, error) bool) { + rd := newReader(r) + for { + bed, err := rd.read() + if err == io.EOF { + return + } + if err != nil { + yield(nil, err) + return + } + if !yield(bed, nil) { + return + } + } + } +} + +// File returns an iterator over fasta entries in a file. +func File(file string) iter.Seq2[*BED, error] { + return func(yield func(*BED, error) bool) { + f, err := aio.Open(file) + if err != nil { + yield(nil, err) + return + } + defer f.Close() + } +} diff --git a/formats/fasta/fasta.go b/formats/fasta/fasta.go index b78a135..a61da44 100644 --- a/formats/fasta/fasta.go +++ b/formats/fasta/fasta.go @@ -4,6 +4,8 @@ // https://en.wikipedia.org/wiki/FASTA_format // // This package does not validate sequence characters. +// +// Deprecated: use v2. package fasta import ( diff --git a/formats/fasta/v2/fasta.go b/formats/fasta/v2/fasta.go new file mode 100644 index 0000000..ecbd03d --- /dev/null +++ b/formats/fasta/v2/fasta.go @@ -0,0 +1,140 @@ +// Package fasta decodes and encodes fasta files. +// +// This package uses the format described in: +// https://en.wikipedia.org/wiki/FASTA_format +// +// This package does not validate sequence characters. +package fasta + +import ( + "bufio" + "bytes" + "fmt" + "io" +) + +const ( + textLineLen = 80 +) + +// Fasta is a single sequence in a fasta file. +type Fasta struct { + Name []byte // Sequence name (without the '>') + Sequence []byte // Sequence +} + +// MarshalText returns the textual representation of f in fasta format. Includes a +// trailing new line. Always includes a name line, even for empty names. Sequence +// gets broken down into lines of length 80. +func (f *Fasta) MarshalText() ([]byte, error) { + n := 2 + len(f.Name) + len(f.Sequence) + + (len(f.Sequence)+textLineLen-1)/textLineLen + buf := bytes.NewBuffer(make([]byte, 0, n)) + f.Write(buf) + if buf.Len() != n { + panic(fmt.Sprintf("bad len: %v want %v", buf.Len(), n)) + } + return buf.Bytes(), nil +} + +// Write writes this entry in textual Fasta format to the given writer. +// Includes a trailing new line. +func (f *Fasta) Write(w io.Writer) error { + if _, err := fmt.Fprintf(w, ">%s\n", f.Name); err != nil { + return err + } + for i := 0; i < len(f.Sequence); i += textLineLen { + to := min(i+textLineLen, len(f.Sequence)) + if _, err := fmt.Fprintf(w, "%s\n", f.Sequence[i:to]); err != nil { + return err + } + } + return nil +} + +// A reader reads sequences from a fasta stream. +type reader struct { + r *bufio.Reader +} + +// Returns a new fasta reader that reads from r. +func newReader(r io.Reader) *reader { + return &reader{bufio.NewReader(r)} +} + +// read reads a single fasta sequence from a stream. Returns EOF only if +// nothing was read. +func (r *reader) read() (*Fasta, error) { + // States of the reader. + const ( + stateStart = iota // Beginning of input + stateNewLine // Beginning of new line + stateName // Middle of name + stateSequence // Middle sequence + ) + + // Start reading. + result := &Fasta{} + state := stateStart + var b byte + var err error + readAnything := false + +loop: + for b, err = r.r.ReadByte(); err == nil; b, err = r.r.ReadByte() { + readAnything = true + switch state { + case stateStart: + // '>' marks the name of the sequence. + if b == '>' { + state = stateName + } else { + // If no '>' then only sequence without name. + state = stateSequence + if b == '\n' || b == '\r' { + state = stateNewLine + } else { + result.Sequence = append(result.Sequence, b) + } + } + + case stateSequence: + if b == '\n' || b == '\r' { + state = stateNewLine + } else { + result.Sequence = append(result.Sequence, b) + } + + case stateName: + if b == '\n' || b == '\r' { + state = stateNewLine + } else { + result.Name = append(result.Name, b) + } + + case stateNewLine: + if b == '\n' || b == '\r' { + // Nothing. Move on to the next line. + } else if b == '>' { + // New sequence => done reading. + r.r.UnreadByte() + break loop + } else { + // Just more sequence. + state = stateSequence + result.Sequence = append(result.Sequence, b) + } + } + } + + // Return EOF only if encountered before reading anything. + if !readAnything { + return nil, err + } + // EOF will be returned on the next call to Next. + if err != nil && err != io.EOF { + return nil, err + } + + return result, nil +} diff --git a/formats/fasta/v2/fasta_test.go b/formats/fasta/v2/fasta_test.go new file mode 100644 index 0000000..6f15d20 --- /dev/null +++ b/formats/fasta/v2/fasta_test.go @@ -0,0 +1,106 @@ +package fasta + +import ( + "bytes" + "fmt" + "reflect" + "strings" + "testing" +) + +func TestReader_simple(t *testing.T) { + input := ">foo\nAaTtGnNngcCaN" + want := []*Fasta{{[]byte("foo"), []byte("AaTtGnNngcCaN")}} + var got []*Fasta + for fa, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fa) + } + if !reflect.DeepEqual(want, got) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestReader_noName(t *testing.T) { + input := "AaTtGngcCaN" + want := []*Fasta{{nil, []byte("AaTtGngcCaN")}} + var got []*Fasta + for fa, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fa) + } + if !reflect.DeepEqual(want, got) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestReader_multiline(t *testing.T) { + input := ">foo\nAaTtGngcCaN\nGGgg\n>foo" + want := []*Fasta{ + {[]byte("foo"), []byte("AaTtGngcCaNGGgg")}, + {[]byte("foo"), nil}, + } + var got []*Fasta + for fa, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fa) + } + if !reflect.DeepEqual(want, got) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestReader_multiple(t *testing.T) { + input := ">foo\nAaTtGngcCaN\nGGgg\n>bar\naaaGcgnnNcTAtgGa\nAA\n\nGagaGNtCc" + want := []*Fasta{ + {[]byte("foo"), []byte("AaTtGngcCaNGGgg")}, + {[]byte("bar"), []byte("aaaGcgnnNcTAtgGaAAGagaGNtCc")}, + } + var got []*Fasta + for fa, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fa) + } + if !reflect.DeepEqual(want, got) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestMarshalText(t *testing.T) { + tests := []struct { + input *Fasta + want string + }{ + {&Fasta{[]byte("Hello"), []byte("ATGGCC")}, ">Hello\nATGGCC\n"}, + {&Fasta{[]byte("Bye"), nil}, ">Bye\n"}, + {&Fasta{[]byte("Howdy"), bytes.Repeat([]byte("AATTGGCC"), 25)}, + fmt.Sprintf(">Howdy\n%s\n%s\n%s\n", + strings.Repeat("AATTGGCC", 10), + strings.Repeat("AATTGGCC", 10), + strings.Repeat("AATTGGCC", 5), + )}, + } + for _, test := range tests { + if got, err := test.input.MarshalText(); err != nil || + string(got) != test.want { + t.Errorf("%v.Text()=%v,%v, want %v", test.input, got, err, test.want) + } + } +} + +func BenchmarkMarshalText(b *testing.B) { + fa := &Fasta{Name: []byte("bla bla bla bla bla bla"), + Sequence: bytes.Repeat([]byte("a"), 1000)} + b.ResetTimer() + for i := 0; i < b.N; i++ { + fa.MarshalText() + } +} diff --git a/formats/fasta/v2/iter.go b/formats/fasta/v2/iter.go new file mode 100644 index 0000000..e17c0d9 --- /dev/null +++ b/formats/fasta/v2/iter.go @@ -0,0 +1,54 @@ +package fasta + +import ( + "io" + "iter" + + "github.com/fluhus/gostuff/aio" +) + +// Returns an iterator over fasta entries. +func (r *reader) iter() iter.Seq2[*Fasta, error] { + return func(yield func(*Fasta, error) bool) { + for { + fa, err := r.read() + if err != nil { + if err != io.EOF { + yield(nil, err) + } + break + } + if !yield(fa, nil) { + return + } + } + } +} + +// File returns an iterator over fasta entries in a file. +func File(file string) iter.Seq2[*Fasta, error] { + return func(yield func(*Fasta, error) bool) { + f, err := aio.Open(file) + if err != nil { + yield(nil, err) + return + } + defer f.Close() + for fa, err := range Reader(f) { + if !yield(fa, err) { + break + } + } + } +} + +// Reader returns an iterator over fasta entries in a reader. +func Reader(r io.Reader) iter.Seq2[*Fasta, error] { + return func(yield func(*Fasta, error) bool) { + for fa, err := range newReader(r).iter() { + if !yield(fa, err) { + break + } + } + } +} diff --git a/formats/fastq/fastq.go b/formats/fastq/fastq.go index d5539b0..9b5c9d6 100644 --- a/formats/fastq/fastq.go +++ b/formats/fastq/fastq.go @@ -4,6 +4,8 @@ // https://en.wikipedia.org/wiki/FASTQ_format // // This package does not validate sequence and quality characters. +// +// Deprecated: use v2. package fastq import ( diff --git a/formats/fastq/v2/fastq.go b/formats/fastq/v2/fastq.go new file mode 100644 index 0000000..40ba0b3 --- /dev/null +++ b/formats/fastq/v2/fastq.go @@ -0,0 +1,109 @@ +// Package fastq decodes and encodes fastq files. +// +// This package uses the format described in: +// https://en.wikipedia.org/wiki/FASTQ_format +// +// This package does not validate sequence and quality characters. +package fastq + +import ( + "bufio" + "bytes" + "fmt" + "io" + "slices" +) + +// Fastq is a single sequence in a fastq file. +type Fastq struct { + Name []byte // Entry name (without the '@'). + Sequence []byte // Sequence. + Quals []byte // Qualities as ASCII characters. +} + +// MarshalText returns the textual representation of f in fastq format. +// Includes a trailing new line. +func (f *Fastq) MarshalText() ([]byte, error) { + n := 6 + len(f.Name) + len(f.Sequence) + len(f.Quals) + buf := bytes.NewBuffer(make([]byte, 0, n)) + f.Write(buf) + if buf.Len() != n { + panic(fmt.Sprintf("bad length: %v expected %v", buf.Len(), n)) + } + return buf.Bytes(), nil +} + +// Write writes this entry in textual Fastq format to the given writer. +// Includes a trailing new line. +func (f *Fastq) Write(w io.Writer) error { + _, err := fmt.Fprintf(w, "@%s\n%s\n+\n%s\n", f.Name, f.Sequence, f.Quals) + return err +} + +// A reader reads sequences from a fastq stream. +type reader struct { + s *bufio.Scanner +} + +// Returns a new fastq reader that reads from r. +func newReader(r io.Reader) *reader { + return &reader{s: bufio.NewScanner(r)} +} + +// Reads the next fastq entry from the reader. +// Returns a non-nil error if reading fails, or io.EOF if encountered end of +// file. When EOF is returned, no fastq is available. +func (r *reader) read() (*Fastq, error) { + // Read name. + if !r.s.Scan() { + if r.s.Err() == nil { + return nil, io.EOF + } + return nil, fmt.Errorf("fastq read: %v", r.s.Err()) + } + name := slices.Clone(r.s.Bytes()) + + // Handle name. + if len(name) == 0 || name[0] != '@' { + return nil, fmt.Errorf("fastq read: expected '@' at beginning of"+ + " line: %q", string(name)) + } + name = name[1:] + + // Read sequence + if !r.s.Scan() { + if r.s.Err() == nil { + return nil, io.ErrUnexpectedEOF + } + return nil, fmt.Errorf("fastq read: %v", r.s.Err()) + } + seq := slices.Clone(r.s.Bytes()) + + // Read plus + if !r.s.Scan() { + if r.s.Err() == nil { + return nil, io.ErrUnexpectedEOF + } + return nil, fmt.Errorf("fastq read: %v", r.s.Err()) + } + plus := r.s.Bytes() + if !bytes.HasPrefix(plus, []byte("+")) { + return nil, fmt.Errorf("fastq read: expected '+' at beginning of"+ + " line: %q", string(plus)) + } + + // Read qualities + if !r.s.Scan() { + if r.s.Err() == nil { + return nil, io.ErrUnexpectedEOF + } + return nil, fmt.Errorf("fastq read: %v", r.s.Err()) + } + quals := slices.Clone(r.s.Bytes()) + if len(quals) != len(seq) { + return nil, fmt.Errorf("fastq read: sequence and qualities have"+ + " different lengths: %v and %v", len(seq), len(quals)) + } + + return &Fastq{name, seq, quals}, nil +} diff --git a/formats/fastq/v2/fastq_test.go b/formats/fastq/v2/fastq_test.go new file mode 100644 index 0000000..66e24e2 --- /dev/null +++ b/formats/fastq/v2/fastq_test.go @@ -0,0 +1,65 @@ +package fastq + +import ( + "reflect" + "strings" + "testing" +) + +func TestReader_simple(t *testing.T) { + input := "@hello\nAAATTTGG\n+\n!!!@@@!!" + want := []*Fastq{{[]byte("hello"), []byte("AAATTTGG"), []byte("!!!@@@!!")}} + var got []*Fastq + for fq, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fq) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestReader_bad(t *testing.T) { + inputs := []string{ + "@hello\nAAA\n+", + "@hello\nAAA\n+!!", + "hello\nAAA\n+\n!!!", + "@hello\nAAA\n-\n!!!", + } + for _, input := range inputs { + for got, err := range Reader(strings.NewReader(input)) { + if err == nil { + t.Errorf("Reader(%q)=%v, want fail", input, got) + } + break + } + } +} + +func TestReader_many(t *testing.T) { + input := "@a\nAA\n+\n!!\n@c\nCCC\n+\nKKK" + want := []*Fastq{ + {[]byte("a"), []byte("AA"), []byte("!!")}, + {[]byte("c"), []byte("CCC"), []byte("KKK")}, + } + var got []*Fastq + for fq, err := range Reader(strings.NewReader(input)) { + if err != nil { + t.Fatalf("Reader(%q) failed: %v", input, err) + } + got = append(got, fq) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Reader(%q)=%v, want %v", input, got, want) + } +} + +func TestMarshalTextText(t *testing.T) { + input := &Fastq{[]byte("Hello"), []byte("AGAGAG"), []byte("!@##@!")} + want := "@Hello\nAGAGAG\n+\n!@##@!\n" + if got, err := input.MarshalText(); err != nil || string(got) != want { + t.Fatalf("%v.Text()=%v,%v want %v", input, got, err, want) + } +} diff --git a/formats/fastq/v2/iter.go b/formats/fastq/v2/iter.go new file mode 100644 index 0000000..71b475c --- /dev/null +++ b/formats/fastq/v2/iter.go @@ -0,0 +1,54 @@ +package fastq + +import ( + "io" + "iter" + + "github.com/fluhus/gostuff/aio" +) + +// Returns an iterator over fastq entries. +func (r *reader) iter() iter.Seq2[*Fastq, error] { + return func(yield func(*Fastq, error) bool) { + for { + fq, err := r.read() + if err != nil { + if err != io.EOF { + yield(nil, err) + } + break + } + if !yield(fq, nil) { + return + } + } + } +} + +// File returns an iterator over fastq entries in a file. +func File(file string) iter.Seq2[*Fastq, error] { + return func(yield func(*Fastq, error) bool) { + f, err := aio.Open(file) + if err != nil { + yield(nil, err) + return + } + defer f.Close() + for fq, err := range Reader(f) { + if !yield(fq, err) { + break + } + } + } +} + +// Reader returns an iterator over fastq entries in a reader. +func Reader(r io.Reader) iter.Seq2[*Fastq, error] { + return func(yield func(*Fastq, error) bool) { + for fa, err := range newReader(r).iter() { + if !yield(fa, err) { + break + } + } + } +} diff --git a/formats/newick/newick.go b/formats/newick/newick.go index 5488c24..f5918da 100644 --- a/formats/newick/newick.go +++ b/formats/newick/newick.go @@ -2,6 +2,8 @@ // // This package uses the format described in: // https://en.wikipedia.org/wiki/Newick_format +// +// Deprecated: use v2. package newick import ( diff --git a/formats/newick/v2/newick.go b/formats/newick/v2/newick.go new file mode 100644 index 0000000..fdec0e1 --- /dev/null +++ b/formats/newick/v2/newick.go @@ -0,0 +1,279 @@ +// Package newick decodes and encodes Newick-formatted trees. +// +// This package uses the format described in: +// https://en.wikipedia.org/wiki/Newick_format +package newick + +import ( + "bufio" + "bytes" + "fmt" + "io" + "iter" + "strconv" + "strings" + + "github.com/fluhus/gostuff/aio" +) + +// BUG(amit): Comments are currently not supported. + +// A Node is a single node in a tree, along with the subtree that is under it. +type Node struct { + Name string // Node name. May be empty. + Distance float64 // Distance from parent. 0 is treated as none. + Children []*Node // Child nodes. May be nil. +} + +// MarshalText returns a condensed Newick-format representation of this node. +func (n *Node) MarshalText() ([]byte, error) { + buf := bytes.NewBuffer(nil) + n.newick(buf) + buf.WriteByte(';') + return buf.Bytes(), nil +} + +// Write writes a condensed Newick-format representation of this node +// to w. +func (n *Node) Write(w io.Writer) error { + txt, err := n.MarshalText() + if err != nil { + return err + } + _, err = w.Write(txt) + return err +} + +// Writes a single node/subtree to the buffer. +func (n *Node) newick(buf *bytes.Buffer) { + if len(n.Children) > 0 { + buf.WriteByte('(') + for i, c := range n.Children { + if i > 0 { + buf.WriteByte(',') + } + c.newick(buf) + } + buf.WriteByte(')') + } + buf.WriteString(nameToText(n.Name)) + if n.Distance != 0 { + fmt.Fprint(buf, ":", n.Distance) + } +} + +// Reads Newick-formatted trees. +type reader struct { + r *bufio.Reader + b *bytes.Buffer +} + +// Returns a reader that reads from r. +func newReader(r io.Reader) *reader { + return &reader{bufio.NewReader(r), &bytes.Buffer{}} +} + +// Reads a single tree from the input. Can be called multiple times to read +// subsequent trees. +func (r *reader) read() (*Node, error) { + // TODO(amit): Break this down to subfunctions. + + // Possible states. + const ( + beforeNode = iota + afterName + afterColon + afterDist + afterChildren + ) + + stack := []*Node{{}} + state := beforeNode + readAny := false // Accepting EOF only if no tokens are available. + +loop: + for { + token, err := r.nextToken() + if err != nil { + if err == io.EOF && readAny { + return nil, io.ErrUnexpectedEOF + } + return nil, err + } + readAny = true + switch token { + case "(": + if state != beforeNode { + return nil, fmt.Errorf("unexpected '('") + } + cur := stack[len(stack)-1] + node := &Node{} + cur.Children = append(cur.Children, node) + stack = append(stack, node) + case ")": + if state == afterColon { + return nil, fmt.Errorf("unexpected ')'") + } + if len(stack) == 1 { + return nil, fmt.Errorf("too many ')'") + } + stack = stack[:len(stack)-1] + state = afterChildren + case ",": + if state == afterColon { + return nil, fmt.Errorf("unexpected ',' after ':'") + } + if len(stack) == 1 { + return nil, fmt.Errorf("found ',' after top level node") + } + node := &Node{} + parent := stack[len(stack)-2] + parent.Children = append(parent.Children, node) + stack[len(stack)-1] = node + state = beforeNode + case ":": + if state == afterColon || state == afterDist { + return nil, fmt.Errorf("unexpected ':' after ':'") + } + state = afterColon + case ";": + if len(stack) != 1 { + return nil, fmt.Errorf("unexpected ';' at depth %d", len(stack)+1) + } + if state == afterColon { + return nil, fmt.Errorf("unexpected ';' after ':'") + } + break loop + default: + if state == afterName || state == afterDist { + return nil, fmt.Errorf("unexpected token: %q", token) + } + cur := stack[len(stack)-1] + if state == beforeNode || state == afterChildren { + cur.Name = nameFromText(token) + state = afterName + break + } + if state != afterColon { + panic(fmt.Sprintf("unexpected state: %d", state)) + } + dist, err := strconv.ParseFloat(token, 64) + if err != nil { + return nil, err + } + cur.Distance = dist + state = afterDist + } + } + return stack[0], nil +} + +// Reads a single token from the input stream. +func (r *reader) nextToken() (string, error) { + r.b.Reset() + quote := false + afterQuote := false + +loop: + for { + b, err := r.r.ReadByte() + if err != nil { + if err == io.EOF && r.b.Len() > 0 { + break loop + } + return "", err + } + + if quote { + if b == '\'' { + afterQuote = !afterQuote + } else if afterQuote { + // End of quoted string. + r.r.UnreadByte() + break + } + r.b.WriteByte(b) + continue + } + + switch b { + case '\'': + if r.b.Len() > 0 { + return "", fmt.Errorf("unexpected ' after %q", r.b.String()) + } + quote = true + r.b.WriteByte(b) + case '(', ')', ',', ':', ';': + if r.b.Len() > 0 { + r.r.UnreadByte() + break loop + } + return string([]byte{b}), nil + case ' ', '\t', '\n', '\r': + if r.b.Len() > 0 { + break loop + } + default: + r.b.WriteByte(b) + } + } + return r.b.String(), nil +} + +// Converts a possibly quoted name from newick format to regular string. +func nameFromText(s string) string { + if quoted(s) { + return strings.ReplaceAll(s[1:len(s)-1], "''", "'") + } + return strings.ReplaceAll(s, "_", " ") +} + +// Converts a name to newick format. +func nameToText(s string) string { + if strings.ContainsAny(s, "(),:;'_\t") { + return "'" + strings.ReplaceAll(s, "'", "''") + "'" + } + return strings.ReplaceAll(s, " ", "_") +} + +// Checks if a newick-formatted name is in quotes. +func quoted(s string) bool { + return len(s) >= 2 && s[0] == '\'' && s[len(s)-1] == '\'' +} + +// Reader returns an iterator over trees in a reader. +func Reader(r io.Reader) iter.Seq2[*Node, error] { + return func(yield func(*Node, error) bool) { + rd := newReader(r) + for { + n, err := rd.read() + if err == io.EOF { + return + } + if err != nil { + yield(nil, err) + return + } + if !yield(n, nil) { + return + } + } + } +} + +// File returns an iterator over trees in a file. +func File(file string) iter.Seq2[*Node, error] { + return func(yield func(*Node, error) bool) { + f, err := aio.Open(file) + if err != nil { + yield(nil, err) + return + } + defer f.Close() + for n, err := range Reader(f) { + if !yield(n, err) { + return + } + } + } +} diff --git a/formats/newick/v2/newick_test.go b/formats/newick/v2/newick_test.go new file mode 100644 index 0000000..2b7d4be --- /dev/null +++ b/formats/newick/v2/newick_test.go @@ -0,0 +1,199 @@ +package newick + +import ( + "io" + "reflect" + "strings" + "testing" +) + +func TestMarshalText(t *testing.T) { + tests := []struct { + input *Node + want string + }{ + {&Node{"hi", 0, nil}, "hi;"}, + {&Node{"", 0, []*Node{ + {"aaa", 11, nil}, + {"", 22, []*Node{ + {"bb", 23, []*Node{ + {"B", 0, nil}, + }}, + {"bbbb", 25, nil}, + }}, + {"c", 33, nil}, + }}, "(aaa:11,((B)bb:23,bbbb:25):22,c:33);"}, + {&Node{"g(bb)", 0, []*Node{ + {"AAA", 0, nil}, {"A b", 0, nil}, + }}, "(AAA,A_b)'g(bb)';"}, + } + for _, test := range tests { + if got, err := test.input.MarshalText(); err != nil || + string(got) != test.want { + t.Errorf("%v.Newick()=%q,%v, want %q", test.input, got, err, test.want) + } + } +} + +func TestNextToken(t *testing.T) { + tests := []struct { + input string + want []string + }{ + {"(", []string{"("}}, + {"(((", []string{"(", "(", "("}}, + {"( ( (", []string{"(", "(", "("}}, + {"AAAA", []string{"AAAA"}}, + {"AA(BB", []string{"AA", "(", "BB"}}, + {"AA 'BB GG':'abc))''def(:'", + []string{"AA", "'BB GG'", ":", "'abc))''def(:'"}}, + { + "(a:3.14,B,:6)GGG;", []string{ + "(", "a", ":", "3.14", ",", "B", ",", ":", "6", ")", "GGG", ";"}, + }, + { + " ( a:3.14, B, :6) GGG ; ", []string{ + "(", "a", ":", "3.14", ",", "B", ",", ":", "6", ")", "GGG", ";"}, + }, + } + + for _, test := range tests { + var got []string + var err error + var token string + r := newReader(strings.NewReader(test.input)) + for token, err = r.nextToken(); err == nil; token, err = r.nextToken() { + got = append(got, token) + } + if err != io.EOF { + t.Fatalf("Reader(%q).nextToken() failed: %v", + test.input, err) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("Reader(%q).nextToken()=%v, want %v", + test.input, got, test.want) + } + } +} + +func TestReader(t *testing.T) { + tests := []struct { + input string + want *Node + }{ + {";", &Node{"", 0, nil}}, + {"();", &Node{"", 0, []*Node{{}}}}, + {":4;", &Node{"", 4, nil}}, + {"AAA;", &Node{"AAA", 0, nil}}, + {"(AAA,A_b)'g(bb)';", &Node{"g(bb)", 0, []*Node{ + {"AAA", 0, nil}, {"A b", 0, nil}, + }}}, + {"AAA:1.23;", &Node{"AAA", 1.23, nil}}, + {"(A,(B,C))D;", &Node{"D", 0, []*Node{ + {"A", 0, nil}, {"", 0, []*Node{{"B", 0, nil}, {"C", 0, nil}}}, + }}}, + {"(A,BB,CCC);", &Node{"", 0, + []*Node{{"A", 0, nil}, {"BB", 0, nil}, {"CCC", 0, nil}}}}, + {" (aaa:11,( ( B)bb: 23, bbbb:25):22, c:33 ) ; ", + &Node{"", 0, []*Node{ + {"aaa", 11, nil}, + {"", 22, []*Node{ + {"bb", 23, []*Node{ + {"B", 0, nil}, + }}, + {"bbbb", 25, nil}, + }}, + {"c", 33, nil}, + }}}, + } + for _, test := range tests { + got, err := newReader(strings.NewReader(test.input)).read() + if err != nil { + t.Fatalf("Read(%q) failed: %v", test.input, err) + } + if !reflect.DeepEqual(got, test.want) { + t.Fatalf("Read(%q)=%v, want %v", test.input, got, test.want) + } + } +} + +func TestReader_multi(t *testing.T) { + input := "AAA;BB:123;:321;" + want := []*Node{ + {"AAA", 0, nil}, + {"BB", 123, nil}, + {"", 321, nil}, + } + var got []*Node + var node *Node + var err error + r := newReader(strings.NewReader(input)) + for node, err = r.read(); err == nil; node, err = r.read() { + got = append(got, node) + } + if err != io.EOF { + t.Fatalf("Read(%q) failed: %v", input, err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Read(%q)=%v, want %v", input, got, want) + } +} + +func TestReader_bad(t *testing.T) { + inputs := []string{ + "AAA", + ");", + "());", + "AAA:;", + "(AAA:);", + "AAA::123;", + "AAA AAA;", + "AAA:123 AAA;", + } + for _, input := range inputs { + got, err := newReader(strings.NewReader(input)).read() + if err == nil { + t.Errorf("Read(%q)=%v, want error", input, got) + } + } +} + +func TestNameFromText(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"A", "A"}, + {"A_b", "A b"}, + {"A_b__c_", "A b c "}, + {"'A'", "A"}, + {"'A_b'", "A_b"}, + {"'A_b '' g(5)'", "A_b ' g(5)"}, + } + for _, test := range tests { + if got := nameFromText(test.input); got != test.want { + t.Errorf("nameFromText(%q)=%q, want %q", + test.input, got, test.want) + } + } +} + +func TestNameToText(t *testing.T) { + tests := []struct { + input string + want string + }{ + {"A", "A"}, + {"A b", "A_b"}, + {"A b c ", "A_b__c_"}, + {"A_b", "'A_b'"}, + {"A'b", "'A''b'"}, + {"A_b ' g(5)", "'A_b '' g(5)'"}, + } + for _, test := range tests { + if got := nameToText(test.input); got != test.want { + t.Errorf("nameToText(%q)=%q, want %q", + test.input, got, test.want) + } + } +} diff --git a/formats/newick/v2/traverse.go b/formats/newick/v2/traverse.go new file mode 100644 index 0000000..200130f --- /dev/null +++ b/formats/newick/v2/traverse.go @@ -0,0 +1,57 @@ +// Tree traversal functionality. + +package newick + +import "iter" + +// PreOrder returns an iterator over nodes in the tree. +// Every node is called before its children. +// +// Does not make recursive calls. +func (n *Node) PreOrder() iter.Seq[*Node] { + return n.traverse(true) +} + +// PostOrder returns an iterator over nodes in the tree. +// Every node is called after its children. +// +// Does not make recursive calls. +func (n *Node) PostOrder() iter.Seq[*Node] { + return n.traverse(false) +} + +// Returns an iterator over nodes in the tree. +// pre determines whether this is pre- or post-order. +func (n *Node) traverse(pre bool) iter.Seq[*Node] { + return func(yield func(*Node) bool) { + stack := []traversalStep{{n, 0}} + for len(stack) > 0 { + stepi := len(stack) - 1 + step := stack[stepi] + + if pre && step.i == 0 { + if !yield(step.n) { + return + } + } + if step.i == len(step.n.Children) { + if !pre { + if !yield(step.n) { + return + } + } + stack = stack[:len(stack)-1] + continue + } + + stack = append(stack, traversalStep{step.n.Children[step.i], 0}) + stack[stepi].i++ + } + } +} + +// A step in the traversal stack. +type traversalStep struct { + n *Node + i int +} diff --git a/formats/newick/v2/traverse_test.go b/formats/newick/v2/traverse_test.go new file mode 100644 index 0000000..40f4870 --- /dev/null +++ b/formats/newick/v2/traverse_test.go @@ -0,0 +1,55 @@ +package newick + +import ( + "reflect" + "strings" + "testing" +) + +func TestPreOrder(t *testing.T) { + tests := []struct { + input string + want []string + }{ + {"A;", []string{"A"}}, + {"(B)A;", []string{"A", "B"}}, + {"((A,B)C,(D,E)F)R;", []string{"R", "C", "A", "B", "F", "D", "E"}}, + } + for _, test := range tests { + tree, err := newReader(strings.NewReader(test.input)).read() + if err != nil { + t.Fatalf("Read(%q) failed: %v", test.input, err) + } + var got []string + for n := range tree.PreOrder() { + got = append(got, n.Name) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("%q.PreOrder()=%v, want %v", test.input, got, test.want) + } + } +} + +func TestPostOrder(t *testing.T) { + tests := []struct { + input string + want []string + }{ + {"A;", []string{"A"}}, + {"(B)A;", []string{"B", "A"}}, + {"((A,B)C,(D,E)F)R;", []string{"A", "B", "C", "D", "E", "F", "R"}}, + } + for _, test := range tests { + tree, err := newReader(strings.NewReader(test.input)).read() + if err != nil { + t.Fatalf("Read(%q) failed: %v", test.input, err) + } + var got []string + for n := range tree.PostOrder() { + got = append(got, n.Name) + } + if !reflect.DeepEqual(got, test.want) { + t.Errorf("%q.PostOrder()=%v, want %v", test.input, got, test.want) + } + } +} diff --git a/formats/sam/sam.go b/formats/sam/sam.go index 9549618..d4f603a 100644 --- a/formats/sam/sam.go +++ b/formats/sam/sam.go @@ -2,6 +2,8 @@ // // This package uses the format described in: // https://en.wikipedia.org/wiki/SAM_(file_format) +// +// Deprecated: use v2. package sam import ( diff --git a/formats/sam/v2/flag.go b/formats/sam/v2/flag.go new file mode 100644 index 0000000..f441a91 --- /dev/null +++ b/formats/sam/v2/flag.go @@ -0,0 +1,180 @@ +package sam + +// Bit values of the flag field. +const ( + FlagMultiple Flag = 1 << iota // Template having multiple segments in sequencing + FlagEach // Each segment properly aligned according to the aligner + FlagUnmapped // Segment unmapped + FlagUnmapped2 // Next segment in the template unmapped + FlagReverseComplement // SEQ being reverse complemented + FlagReverseComplement2 // SEQ of the next segment in the template being reverse complemented + FlagFirst // The first segment in the template + FlagLast // The last segment in the template + FlagSecondary // Secondary alignment + FlagNotPassing // Not passing filters, such as platform/vendor quality controls + FlagDuplicate // PCR or optical duplicate + FlagSupplementary // Supplementary alignment +) + +// Flag is the bitwise flag field in a SAM entry. +type Flag int + +// Multiple returns the FlagMultiple bit value of this flag. +func (f Flag) Multiple() bool { + return f&FlagMultiple > 0 +} + +// Each returns the FlagEach bit value of this flag. +func (f Flag) Each() bool { + return f&FlagEach > 0 +} + +// Unmapped returns the FlagUnmapped bit value of this flag. +func (f Flag) Unmapped() bool { + return f&FlagUnmapped > 0 +} + +// Unmapped2 returns the FlagUnmapped2 bit value of this flag. +func (f Flag) Unmapped2() bool { + return f&FlagUnmapped2 > 0 +} + +// ReverseComplement returns the FlagReverseComplement +// bit value of this flag. +func (f Flag) ReverseComplement() bool { + return f&FlagReverseComplement > 0 +} + +// ReverseComplement2 returns the FlagReverseComplement2 +// bit value of this flag. +func (f Flag) ReverseComplement2() bool { + return f&FlagReverseComplement2 > 0 +} + +// First returns the FlagFirst bit value of this flag. +func (f Flag) First() bool { + return f&FlagFirst > 0 +} + +// Last returns the FlagLast bit value of this flag. +func (f Flag) Last() bool { + return f&FlagLast > 0 +} + +// Secondary returns the FlagSecondary bit value of this flag. +func (f Flag) Secondary() bool { + return f&FlagSecondary > 0 +} + +// NotPassing returns the FlagNotPassing bit value of this flag. +func (f Flag) NotPassing() bool { + return f&FlagNotPassing > 0 +} + +// Duplicate returns the FlagDuplicate bit value of this flag. +func (f Flag) Duplicate() bool { + return f&FlagDuplicate > 0 +} + +// Supplementary returns the FlagSupplementary +// bit value of this flag. +func (f Flag) Supplementary() bool { + return f&FlagSupplementary > 0 +} + +// SetMultiple sets the FlagMultiple bit value for this flag. +func (f *Flag) SetMultiple(value bool) { + if value { + *f |= FlagMultiple + } else { + *f &= ^FlagMultiple + } +} + +func (f *Flag) SetEach(value bool) { + if value { + *f |= FlagEach + } else { + *f &= ^FlagEach + } +} + +func (f *Flag) SetUnmapped(value bool) { + if value { + *f |= FlagUnmapped + } else { + *f &= ^FlagUnmapped + } +} + +func (f *Flag) SetUnmapped2(value bool) { + if value { + *f |= FlagUnmapped2 + } else { + *f &= ^FlagUnmapped2 + } +} + +func (f *Flag) SetReverseComplement(value bool) { + if value { + *f |= FlagReverseComplement + } else { + *f &= ^FlagReverseComplement + } +} + +func (f *Flag) SetReverseComplement2(value bool) { + if value { + *f |= FlagReverseComplement2 + } else { + *f &= ^FlagReverseComplement2 + } +} + +func (f *Flag) SetFirst(value bool) { + if value { + *f |= FlagFirst + } else { + *f &= ^FlagFirst + } +} + +func (f *Flag) SetLast(value bool) { + if value { + *f |= FlagLast + } else { + *f &= ^FlagLast + } +} + +func (f *Flag) SetSecondary(value bool) { + if value { + *f |= FlagSecondary + } else { + *f &= ^FlagSecondary + } +} + +func (f *Flag) SetNotPassing(value bool) { + if value { + *f |= FlagNotPassing + } else { + *f &= ^FlagNotPassing + } +} + +func (f *Flag) SetDuplicate(value bool) { + if value { + *f |= FlagDuplicate + } else { + *f &= ^FlagDuplicate + } +} + +func (f *Flag) SetSupplementary(value bool) { + if value { + *f |= FlagSupplementary + } else { + *f &= ^FlagSupplementary + } +} diff --git a/formats/sam/v2/iter.go b/formats/sam/v2/iter.go new file mode 100644 index 0000000..fc0a2d1 --- /dev/null +++ b/formats/sam/v2/iter.go @@ -0,0 +1,54 @@ +package sam + +import ( + "io" + "iter" + + "github.com/fluhus/gostuff/aio" +) + +// Returns an iterator over SAM entries. +func (r *reader) iter() iter.Seq2[*SAM, error] { + return func(yield func(*SAM, error) bool) { + for { + sm, err := r.read() + if err != nil { + if err != io.EOF { + yield(nil, err) + } + break + } + if !yield(sm, nil) { + return + } + } + } +} + +// Reader returns an iterator over SAM entries in a reader. +func Reader(r io.Reader) iter.Seq2[*SAM, error] { + return func(yield func(*SAM, error) bool) { + for sm, err := range newReader(r).iter() { + if !yield(sm, err) { + break + } + } + } +} + +// File returns an iterator over SAM entries in a file. +func File(file string) iter.Seq2[*SAM, error] { + return func(yield func(*SAM, error) bool) { + f, err := aio.Open(file) + if err != nil { + yield(nil, err) + return + } + defer f.Close() + for sm, err := range newReader(f).iter() { + if !yield(sm, err) { + break + } + } + } +} diff --git a/formats/sam/v2/sam.go b/formats/sam/v2/sam.go new file mode 100644 index 0000000..0d1aeba --- /dev/null +++ b/formats/sam/v2/sam.go @@ -0,0 +1,278 @@ +// Package sam decodes and encodes SAM files. +// +// This package uses the format described in: +// https://en.wikipedia.org/wiki/SAM_(file_format) +package sam + +import ( + "bufio" + "bytes" + "encoding/hex" + "fmt" + "io" + "sort" + "strconv" + + "github.com/fluhus/gostuff/csvdec" +) + +// A raw structure for the initial parsing using csvdec. +type samRaw struct { + Qname string + Flag int + Rname string + Pos int + Mapq int + Cigar string + Rnext string + Pnext int + Tlen int + Seq string + Qual string + Extra []string +} + +// SAM is a single line (alignment) in a SAM file. +type SAM struct { + Qname string // Query name + Flag Flag // Bitwise flag + Rname string // Reference sequence name + Pos int // Mapping position (1-based) + Mapq int // Mapping quality + Cigar string // CIGAR string + Rnext string // Ref. name of the mate/next read + Pnext int // Position of the mate/next read + Tlen int // Observed template length + Seq string // Sequence + Qual string // Phred qualities (ASCII) + Tags map[string]interface{} // Typed optional tags. +} + +// MarshalText returns the textual representation of s in SAM format. +// Includes a trailing new line. +func (s *SAM) MarshalText() ([]byte, error) { + buf := bytes.NewBuffer(nil) + s.Write(buf) + return buf.Bytes(), nil +} + +// Write writes this entry in textual SAM format to the given writer. +// Includes a trailing new line. +func (s *SAM) Write(w io.Writer) error { + if _, err := fmt.Fprintf(w, + "%s\t%d\t%s\t%d\t%d\t%s\t%s\t%d\t%d\t%s\t%s", + s.Qname, s.Flag, s.Rname, s.Pos, + s.Mapq, s.Cigar, s.Rnext, s.Pnext, + s.Tlen, s.Seq, s.Qual, + ); err != nil { + return err + } + for _, tag := range tagsToText(s.Tags) { + if _, err := fmt.Fprintf(w, "\t%s", tag); err != nil { + return err + } + } + if _, err := fmt.Fprintf(w, "\n"); err != nil { + return err + } + return nil +} + +// Converts a raw SAM struct to an exported SAM struct. +func fromRaw(raw *samRaw) (*SAM, error) { + result := &SAM{} + result.Qname = raw.Qname + result.Flag = Flag(raw.Flag) + result.Rname = raw.Rname + result.Pos = raw.Pos + result.Mapq = raw.Mapq + result.Cigar = raw.Cigar + result.Rnext = raw.Rnext + result.Pnext = raw.Pnext + result.Tlen = raw.Tlen + result.Seq = raw.Seq + result.Qual = raw.Qual + result.Tags = map[string]interface{}{} + var err error + result.Tags, err = parseTags(raw.Extra) + if err != nil { + return nil, err + } + return result, nil +} + +// A reader reads and parses SAM lines. +type reader struct { + r *bufio.Reader + d *csvdec.Decoder + h bool // Indicates that we are done reading the header. +} + +// newReader returns a new SAM reader that reads from r. +func newReader(r io.Reader) *reader { + var b *bufio.Reader + switch r := r.(type) { + case *bufio.Reader: + b = r + default: + b = bufio.NewReader(r) + } + d := csvdec.NewDecoder(b) + d.Comma = '\t' + d.FieldsPerRecord = -1 // Allow variable number of fields. + d.LazyQuotes = true + return &reader{b, d, false} +} + +// ReadHeader returns the next header line as a raw string, including the '@'. +// Returns EOF when out of header lines, then Read can be called for the +// data lines. +func (r *reader) ReadHeader() (string, error) { + if r.h { + panic("cannot read header after reading alignments.") + } + b, err := r.r.ReadByte() + if err != nil { + if err == io.EOF { + r.h = true + } + return "", err + } + if b != '@' { + r.r.UnreadByte() + r.h = true + return "", io.EOF + } + line, err := r.r.ReadBytes('\n') + if err != nil { + if err == io.EOF { + if len(line) == 0 { + return "", io.ErrUnexpectedEOF + } + } else { + return "", err + } + } + line = line[:len(line)-1] + if len(line) == 0 { + return "", fmt.Errorf("encountered an empty header line") + } + return "@" + string(line), nil +} + +// read returns the next SAM line, skipping any unread header lines. +func (r *reader) read() (*SAM, error) { + for !r.h { + r.ReadHeader() + } + raw := &samRaw{} + err := r.d.Decode(raw) + if err != nil { + return nil, err + } + return fromRaw(raw) +} + +// Returns a map from tag name to its parsed (typed) value. +func parseTags(values []string) (map[string]interface{}, error) { + result := make(map[string]interface{}, len(values)) + for _, f := range values { + parts, err := splitTag(f) + if err != nil { + return nil, err + } + switch parts[1] { + case "A": + if len(parts[2]) != 1 { + return nil, fmt.Errorf("illegal value for tag type %v: %q, "+ + "want a single character", + parts[1], parts[2]) + } + result[parts[0]] = parts[2][0] + case "i": + x, err := strconv.Atoi(parts[2]) + if err != nil { + return nil, fmt.Errorf("illegal value for tag type %v: %q, "+ + "want an integer", + parts[1], parts[2]) + } + result[parts[0]] = x + case "f": + x, err := strconv.ParseFloat(parts[2], 64) + if err != nil { + return nil, fmt.Errorf("illegal value for tag type %v: %q, "+ + "want a number", + parts[1], parts[2]) + } + result[parts[0]] = x + case "Z": + result[parts[0]] = parts[2] + case "H": + x, err := hex.DecodeString(parts[2]) + if err != nil { + return nil, fmt.Errorf("illegal value for tag type %v: %q, "+ + "want a hexadecimal sequence", + parts[1], parts[2]) + } + result[parts[0]] = x + case "B": + // TODO(amit): Not implemented yet. Treating like string for now. + result[parts[0]] = parts[2] + default: + return nil, fmt.Errorf("unrecognized tag type: %v, in tag %v", + parts[1], f) + } + } + return result, nil +} + +// Splits a SAM tag by colon. Used instead of strings.SpliN for performance. +func splitTag(tag string) ([3]string, error) { + colon1, colon2 := -1, -1 + for i, c := range tag { + if c == ':' { + if colon1 == -1 { + colon1 = i + } else { + colon2 = i + break + } + } + } + var result [3]string + if colon2 == -1 { + return result, fmt.Errorf("tag doesn't have at least 3 colons: %q", tag) + } + result[0] = tag[:colon1] + result[1] = tag[colon1+1 : colon2] + result[2] = tag[colon2+1:] + return result, nil +} + +// Returns the given tags in SAM format, sorted and tab-separated. +func tagsToText(tags map[string]interface{}) []string { + texts := make([]string, 0, len(tags)) + for tag, val := range tags { + texts = append(texts, tagToText(tag, val)) + } + sort.Strings(texts) + return texts +} + +// Returns the SAM format representation of the given tag. +func tagToText(tag string, val interface{}) string { + switch val := val.(type) { + case byte: + return tag + ":A:" + string(val) + case int: + return tag + ":i:" + strconv.Itoa(val) + case float64: + return tag + ":f:" + strconv.FormatFloat(val, 'e', -1, 64) + case string: + return tag + ":Z:" + val + case []byte: + return tag + ":H:" + hex.EncodeToString(val) + default: + panic(fmt.Sprintf("unsupported type for value %v", val)) + } +} diff --git a/formats/sam/v2/sam_test.go b/formats/sam/v2/sam_test.go new file mode 100644 index 0000000..108edfe --- /dev/null +++ b/formats/sam/v2/sam_test.go @@ -0,0 +1,154 @@ +package sam + +import ( + "bytes" + "io" + "reflect" + "strings" + "testing" +) + +func TestReader(t *testing.T) { + input := "@a\n@b\n" + + "c\t2\td\t5\t30\t32M\te\t40\t50\tAAAA\tFFFF\n" + + "f\t6\tg\t10\t60\t4D\th\t70\t80\tTCTC\t!!!!\n" + r := newReader(bytes.NewBuffer([]byte(input))) + + h, err := r.ReadHeader() + if err != nil { + t.Fatalf("ReadHeader() failed: %v", err) + } + if h != "@a" { + t.Fatalf("ReadHeader()=%v, want @a", h) + } + + h, err = r.ReadHeader() + if err != nil { + t.Fatalf("ReadHeader() failed: %v", err) + } + if h != "@b" { + t.Fatalf("ReadHeader()=%v, want @b", h) + } + + _, err = r.ReadHeader() + if err != io.EOF { + t.Fatalf("ReadHeader() error=%v, want EOF", err) + } + + want := &SAM{ + "c", 2, "d", 5, 30, "32M", "e", 40, 50, "AAAA", "FFFF", + map[string]interface{}{}, + } + got, err := r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Next()=%v, want %v", got, want) + } + + want = &SAM{ + "f", 6, "g", 10, 60, "4D", "h", 70, 80, "TCTC", "!!!!", + map[string]interface{}{}, + } + got, err = r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Next()=%v, want %v", got, want) + } + + _, err = r.read() + if err != io.EOF { + t.Fatalf("Next() error=%v, want EOF", err) + } +} + +func TestReader_skipHeader(t *testing.T) { + input := "@a\n@b\n" + + "c\t2\td\t5\t30\t32M\te\t40\t50\tAAAA\tFFFF\n" + + "f\t6\tg\t10\t60\t4D\th\t70\t80\tTCTC\t!!!!\n" + r := newReader(bytes.NewBuffer([]byte(input))) + + want := &SAM{ + "c", 2, "d", 5, 30, "32M", "e", 40, 50, "AAAA", "FFFF", + map[string]interface{}{}, + } + got, err := r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Next()=%v, want %v", got, want) + } + + want = &SAM{ + "f", 6, "g", 10, 60, "4D", "h", 70, 80, "TCTC", "!!!!", + map[string]interface{}{}, + } + got, err = r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Next()=%v, want %v", got, want) + } + + _, err = r.read() + if err != io.EOF { + t.Fatalf("Next() error=%v, want EOF", err) + } +} + +func TestDecoder_tags(t *testing.T) { + input := "c\t2\td\t5\t30\t32M\te\t40\t50\tAAAA\tFFFF\t" + + "BC:Z:barcode\tAS:i:123\tZF:f:3.1415\tZH:H:1234abcd" + r := newReader(bytes.NewBuffer([]byte(input))) + + want := &SAM{ + "c", 2, "d", 5, 30, "32M", "e", 40, 50, "AAAA", "FFFF", + map[string]interface{}{ + "BC": "barcode", + "AS": 123, + "ZF": 3.1415, + "ZH": []byte{18, 52, 171, 205}, + }, + } + + got, err := r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if !reflect.DeepEqual(got, want) { + t.Fatalf("Next()=%v, want %v", got, want) + } +} + +func TestText(t *testing.T) { + input := "GGCGTT\t0\tbvu:BVU_3729\t38\t255\t24M\t*\t0\t0\t" + + "FADFNAKNNKKNLHDCNEYMNNDE\t*AS:i:44\tMD:Z:14G3C5\tNM:i:2\t" + + "ZE:f:1.07e-05\tZF:i:-3\tZI:i:91\tZL:i:116\tZR:i:104\tZS:i:72\n" + r := newReader(strings.NewReader(input)) + sm, err := r.read() + if err != nil { + t.Fatalf("Next() failed: %v", err) + } + if got, err := sm.MarshalText(); err != nil || string(got) != input { + t.Fatalf("%v.Text()=%v,%v want %v", sm, got, err, input) + } +} + +func BenchmarkText(b *testing.B) { + text := "GGCGTT\t0\tbvu:BVU_3729\t38\t255\t24M\t*\t0\t0\t" + + "FADFNAKNNKKNLHDCNEYMNNDE\t*AS:i:44\tMD:Z:14G3C5\tNM:i:2\t" + + "ZE:f:1.07e-05\tZF:i:-3\tZI:i:91\tZL:i:116\tZR:i:104\tZS:i:72\n" + sm, err := newReader(strings.NewReader(text)).read() + if err != nil { + b.Fatal(err) + } + b.ResetTimer() + for i := 0; i < b.N; i++ { + sm.MarshalText() + } +}