Skip to content

Commit

Permalink
Formats v2 initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
fluhus committed Oct 11, 2024
1 parent cc80821 commit e3ca50c
Show file tree
Hide file tree
Showing 22 changed files with 2,202 additions and 1 deletion.
4 changes: 3 additions & 1 deletion formats/bed/bed.go
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,13 @@
// This package uses the format described in:
// https://en.wikipedia.org/wiki/BED_(file_format)
//
// Limitations
// # Limitations
//
// Currently only tab delimiters are supported.
//
// Currently BED headers are not supported.
//
// Deprecated: use v2.
package bed

import (
Expand Down
257 changes: 257 additions & 0 deletions formats/bed/v2/bed.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
// Package bed decodes and encodes BED files.
//
// This package uses the format described in:
// https://en.wikipedia.org/wiki/BED_(file_format)
//
// # Limitations
//
// Currently only tab delimiters are supported.
//
// Currently BED headers are not supported.
package bed

import (
"bytes"
"encoding/csv"
"fmt"
"io"
"strconv"
"strings"
)

// Valid values for the strand field.
//
// parseLine accepts exactly these three strings (or an empty/absent field)
// in column 6 and rejects anything else.
const (
// PlusStrand is the "+" strand marker.
PlusStrand = "+"
// MinusStrand is the "-" strand marker.
MinusStrand = "-"
// NoStrand is the "." marker, used when no strand is specified.
NoStrand = "."
)

// BED is a single line in a BED file.
//
// N records how many of the 12 columns are meaningful. Fields beyond the
// first N are ignored when encoding and are left at their zero values when
// decoding.
type BED struct {
	N           int     // Number of fields in this entry (3-12)
	Chrom       string  // Column 1
	ChromStart  int     // Column 2, 0-based
	ChromEnd    int     // Column 3, 0-based exclusive
	Name        string  // Column 4
	Score       int     // Column 5
	Strand      string  // Column 6, one of PlusStrand, MinusStrand, NoStrand
	ThickStart  int     // Column 7
	ThickEnd    int     // Column 8
	ItemRGB     [3]byte // Column 9, encoded as "R,G,B" in decimal
	BlockCount  int     // Column 10
	BlockSizes  []int   // Column 11, length should match BlockCount
	BlockStarts []int   // Column 12, length should match BlockCount
}

// Write writes the textual BED format representation of b to w.
// Encodes the first b.N fields, where b.N is between 3 and 12.
// Includes a trailing new line.
//
// The whole line is assembled in memory and handed to w in a single Write
// call, so a failing writer never receives a partially encoded record and
// unbuffered writers are not hit with one small write per column.
//
// Returns an error if b.N is out of range or if w.Write fails.
func (b *BED) Write(w io.Writer) error {
	if b.N < 3 || b.N > 12 {
		return fmt.Errorf("bad number of fields: %v, want 3-12", b.N)
	}

	line := make([]byte, 0, 64)

	// Mandatory columns.
	line = append(line, b.Chrom...)
	line = appendTabInt(line, b.ChromStart)
	line = appendTabInt(line, b.ChromEnd)

	// Optional columns, each emitted only when N reaches it.
	if b.N > 3 {
		line = append(line, '\t')
		line = append(line, b.Name...)
	}
	if b.N > 4 {
		line = appendTabInt(line, b.Score)
	}
	if b.N > 5 {
		line = append(line, '\t')
		line = append(line, b.Strand...)
	}
	if b.N > 6 {
		line = appendTabInt(line, b.ThickStart)
	}
	if b.N > 7 {
		line = appendTabInt(line, b.ThickEnd)
	}
	if b.N > 8 {
		line = append(line, '\t')
		for i, c := range b.ItemRGB {
			if i > 0 {
				line = append(line, ',')
			}
			line = strconv.AppendUint(line, uint64(c), 10)
		}
	}
	if b.N > 9 {
		line = appendTabInt(line, b.BlockCount)
	}
	if b.N > 10 {
		line = appendTabInts(line, b.BlockSizes)
	}
	if b.N > 11 {
		line = appendTabInts(line, b.BlockStarts)
	}
	line = append(line, '\n')

	_, err := w.Write(line)
	return err
}

// MarshalText returns the textual representation of b in BED format.
// Encodes the first b.N fields, where b.N is between 3 and 12.
// Includes a trailing new line.
func (b *BED) MarshalText() ([]byte, error) {
	var buf bytes.Buffer
	if err := b.Write(&buf); err != nil {
		return nil, err
	}
	return buf.Bytes(), nil
}

// appendTabInt appends a tab followed by the decimal form of v to dst.
func appendTabInt(dst []byte, v int) []byte {
	dst = append(dst, '\t')
	return strconv.AppendInt(dst, int64(v), 10)
}

// appendTabInts appends a tab followed by vs as a comma-separated decimal
// list to dst. An empty vs yields just the tab.
func appendTabInts(dst []byte, vs []int) []byte {
	dst = append(dst, '\t')
	for i, v := range vs {
		if i > 0 {
			dst = append(dst, ',')
		}
		dst = strconv.AppendInt(dst, int64(v), 10)
	}
	return dst
}

// parseLine converts the already-split columns of one BED line into a BED.
//
// fields must hold between 3 and 12 columns; the returned BED has N set to
// len(fields), and every column beyond that is left at its zero value. An
// optional column that is present but empty is also left at its zero value.
// fields itself is never modified.
//
// An error is returned when the column count is out of range, a numeric
// column does not parse, the strand is not one of PlusStrand, MinusStrand or
// NoStrand, the RGB column is not three decimal values in 0-255, or the
// lengths of blockSizes/blockStarts disagree with blockCount. Numeric parse
// errors wrap the underlying strconv error.
func parseLine(fields []string) (*BED, error) {
	n := len(fields)
	if n < 3 || n > 12 {
		return nil, fmt.Errorf("bad number of fields: %v, want 3-12", n)
	}

	// Pad to 12 columns in a private array to make parsing easy. Appending to
	// fields directly could overwrite the caller's backing array when it has
	// spare capacity.
	var f [12]string
	copy(f[:], fields)

	bed := &BED{N: n}
	var err error

	// Mandatory fields.
	bed.Chrom = f[0]
	if bed.ChromStart, err = strconv.Atoi(f[1]); err != nil {
		return nil, fmt.Errorf("field 2: %w", err)
	}
	if bed.ChromEnd, err = strconv.Atoi(f[2]); err != nil {
		return nil, fmt.Errorf("field 3: %w", err)
	}

	// Optional fields.
	bed.Name = f[3]
	if bed.Score, err = parseOptionalInt(f[4]); err != nil {
		return nil, fmt.Errorf("field 5: %w", err)
	}
	switch f[5] {
	case "", PlusStrand, MinusStrand, NoStrand:
		bed.Strand = f[5]
	default:
		return nil, fmt.Errorf("field 6: bad strand: %q", f[5])
	}
	if bed.ThickStart, err = parseOptionalInt(f[6]); err != nil {
		return nil, fmt.Errorf("field 7: %w", err)
	}
	if bed.ThickEnd, err = parseOptionalInt(f[7]); err != nil {
		return nil, fmt.Errorf("field 8: %w", err)
	}
	if f[8] != "" {
		rgb := strings.Split(f[8], ",")
		if len(rgb) != 3 {
			return nil, fmt.Errorf("field 9: bad RGB value: %q", f[8])
		}
		for i, c := range rgb {
			// Base 10 explicitly: base 0 would read "010" as octal 8 and
			// reject "008".
			v, perr := strconv.ParseUint(c, 10, 8)
			if perr != nil {
				return nil, fmt.Errorf("field 9: bad RGB value: %q", f[8])
			}
			bed.ItemRGB[i] = byte(v)
		}
	}
	if bed.BlockCount, err = parseOptionalInt(f[9]); err != nil {
		return nil, fmt.Errorf("field 10: %w", err)
	}
	if bed.BlockSizes, err = parseIntList(f[10]); err != nil {
		return nil, fmt.Errorf("field 11: %w", err)
	}
	if bed.BlockStarts, err = parseIntList(f[11]); err != nil {
		return nil, fmt.Errorf("field 12: %w", err)
	}

	if len(bed.BlockSizes) != bed.BlockCount {
		return nil, fmt.Errorf("blockSizes has %v values but blockCount is %v",
			len(bed.BlockSizes), bed.BlockCount)
	}
	if len(bed.BlockStarts) != bed.BlockCount {
		return nil, fmt.Errorf("blockStarts has %v values but blockCount is %v",
			len(bed.BlockStarts), bed.BlockCount)
	}

	return bed, nil
}

// parseOptionalInt parses s as a decimal int, treating the empty string as 0.
func parseOptionalInt(s string) (int, error) {
	if s == "" {
		return 0, nil
	}
	return strconv.Atoi(s)
}

// parseIntList parses a comma-separated list of decimal ints. The empty
// string yields a nil slice. A single trailing comma is tolerated, since BED
// files commonly terminate block lists with one (e.g. "567,488,").
func parseIntList(s string) ([]int, error) {
	if s == "" {
		return nil, nil
	}
	parts := strings.Split(strings.TrimSuffix(s, ","), ",")
	out := make([]int, len(parts))
	for i, p := range parts {
		v, err := strconv.Atoi(p)
		if err != nil {
			return nil, err
		}
		out[i] = v
	}
	return out, nil
}

// reader decodes BED records one at a time from an underlying CSV reader.
type reader struct {
	r *csv.Reader // configured by newReader for tab-separated input
}

// newReader wraps r in a BED reader. Columns are split on tabs, and lines
// starting with '#' are skipped as comments by the underlying csv.Reader.
//
// NOTE(review): the csv.Reader is left with its default FieldsPerRecord, so
// it presumably requires every record to have as many fields as the first
// one; confirm this is the intended behavior for BED input.
func newReader(r io.Reader) *reader {
	tsv := csv.NewReader(r)
	tsv.Comma, tsv.Comment = '\t', '#'
	return &reader{r: tsv}
}

// read decodes the next BED line. The number of columns found on the line is
// stored in the result's N field (always between 3 and 12); those first N
// fields are populated and the rest keep their zero values.
//
// For example if N=5, then the populated fields are Chrom, ChromStart,
// ChromEnd, Name and Score.
//
// Any error from the underlying csv.Reader is returned unchanged, as is any
// error from parseLine.
func (r *reader) read() (b *BED, err error) {
	record, err := r.r.Read()
	if err != nil {
		return nil, err
	}
	return parseLine(record)
}
110 changes: 110 additions & 0 deletions formats/bed/v2/bed_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,110 @@
package bed

import (
"reflect"
"slices"
"strings"
"testing"
)

// TestParseLine checks that parseLine decodes a full 12-column line and a
// 6-column prefix of the same line into the expected BED values.
func TestParseLine(t *testing.T) {
	fields := []string{"chr1", "10", "20", "Hello", "150", "+", "11", "13",
		"50,100,150", "2", "40,60", "100,200"}

	cases := []struct {
		in   []string
		want *BED
	}{
		// Full line.
		{
			in: fields,
			want: &BED{
				N: 12, Chrom: "chr1", ChromStart: 10, ChromEnd: 20,
				Name: "Hello", Score: 150, Strand: "+",
				ThickStart: 11, ThickEnd: 13, ItemRGB: [3]byte{50, 100, 150},
				BlockCount: 2, BlockSizes: []int{40, 60}, BlockStarts: []int{100, 200},
			},
		},
		// Partial line: only the first six columns.
		{
			in: fields[:6],
			want: &BED{
				N: 6, Chrom: "chr1", ChromStart: 10, ChromEnd: 20,
				Name: "Hello", Score: 150, Strand: "+",
			},
		},
	}

	for _, c := range cases {
		got, err := parseLine(c.in)
		if err != nil {
			t.Fatalf("parseLine(%v) failed: %v", c.in, err)
		}
		if !reflect.DeepEqual(got, c.want) {
			t.Fatalf("parseLine(%v)=%v want %v", c.in, got, c.want)
		}
	}
}

// TestParseLine_bad checks that parseLine rejects malformed input. Each case
// applies one corruption to a fresh copy of a known-good line, so cases
// cannot influence one another.
func TestParseLine_bad(t *testing.T) {
	valid := []string{"chr1", "10", "20", "Hello", "150", "+", "11", "13",
		"50,100,150", "2", "40,60", "100,200"}

	// Check good input first, so a failure below is due to the corruption.
	if _, err := parseLine(slices.Clone(valid)); err != nil {
		t.Fatalf("parseLine(%v) failed: %v", valid, err)
	}

	tests := []struct {
		name    string
		corrupt func(f []string) []string
	}{
		{"too few fields", func(f []string) []string { return f[:2] }},
		{"bad strand", func(f []string) []string { f[5] = "t"; return f }},
		{"bad colors (count)", func(f []string) []string { f[8] = "100"; return f }},
		{"bad colors (overflow)", func(f []string) []string { f[8] += "0"; return f }},
		{"block sizes mismatch", func(f []string) []string { f[10] += ",200"; return f }},
		{"block starts mismatch", func(f []string) []string { f[11] += ",300"; return f }},
	}
	for _, tc := range tests {
		t.Run(tc.name, func(t *testing.T) {
			in := tc.corrupt(slices.Clone(valid))
			if got, err := parseLine(in); err == nil {
				t.Fatalf("parseLine(%v)=%v want error", in, got)
			}
		})
	}
}

// TestReader checks that Reader yields the expected BED for a single
// 12-column input line.
func TestReader(t *testing.T) {
	input := "chr1\t10\t20\tHello\t150\t+\t11\t13\t50,100,150\t2\t40,60\t100,200\n"
	want := []*BED{{12, "chr1", 10, 20, "Hello", 150, "+", 11, 13, [3]byte{50, 100, 150},
		2, []int{40, 60}, []int{100, 200}}}
	var got []*BED

	for bed, err := range Reader(strings.NewReader(input)) {
		if err != nil {
			t.Fatalf("Reader(%q) failed: %v", input, err)
		}
		got = append(got, bed)
	}

	if len(got) != len(want) {
		t.Fatalf("Reader(%q) yielded %d records, want %d", input, len(got), len(want))
	}
	// Report records one by one: formatting a []*BED with %v would print
	// pointer addresses rather than the record contents.
	for i := range want {
		if !reflect.DeepEqual(got[i], want[i]) {
			t.Errorf("Reader(%q) record %d = %+v, want %+v", input, i, got[i], want[i])
		}
	}
}

// TestMarshalText checks the encoded form of one BED value when all 12
// fields are requested and again when N is reduced to 6.
func TestMarshalText(t *testing.T) {
	input := &BED{12, "chr1", 10, 20, "Hello", 150, "+", 11, 13, [3]byte{50, 100, 150},
		2, []int{40, 60}, []int{100, 200}}

	cases := []struct {
		n    int
		want string
	}{
		{12, "chr1\t10\t20\tHello\t150\t+\t11\t13\t50,100,150\t2\t40,60\t100,200\n"},
		// Only the first six columns, still newline-terminated.
		{6, "chr1\t10\t20\tHello\t150\t+\n"},
	}

	for _, c := range cases {
		input.N = c.n
		got, err := input.MarshalText()
		if err != nil {
			t.Fatalf("%v.MarshalText() failed: %v", input, err)
		}
		if string(got) != c.want {
			t.Fatalf("%v.MarshalText()=%q, want %q", input, got, c.want)
		}
	}
}
Loading

0 comments on commit e3ca50c

Please sign in to comment.