Update pcap parser processing rate to 1 in 10 archives (#1014)
* Update pcap parser processing rate to 1 in 10 archives

* Use etl.DataType to cast type names

* Add DataType under test to active_test path

* Use path.Join() instead of string concatenation

* Change variable name to 'prefix'
cristinaleonr authored Aug 13, 2021
1 parent c7c6ced commit 81507c8
Showing 5 changed files with 151 additions and 13 deletions.
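The mechanism behind the 1-in-10 rate is a modulo gate: each DataType maps to a skip count, and an archive at position index in the GCS listing is enqueued only when index % (skipCount+1) == 0. A minimal, self-contained sketch of the rule — the skip count of 9 comes from this commit's dataTypeToSkipCount map, while the archive names are illustrative:

package main

import "fmt"

func main() {
	// pcap's configured skip count; every other data type defaults to 0.
	skipCount := 9

	files := make([]string, 21)
	for i := range files {
		files[i] = fmt.Sprintf("archive-%02d.tgz", i)
	}

	for index, f := range files {
		// Enqueue only every (skipCount+1)-th archive: indices 0, 10, 20, ...
		if index%(skipCount+1) == 0 {
			fmt.Println("processing", f)
		}
	}
}

Run against 21 listed archives, this prints indices 0, 10, and 20 — a 1-in-10 sampling rate.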
24 changes: 17 additions & 7 deletions active/active.go
@@ -20,6 +20,8 @@ import (
"google.golang.org/api/iterator"

"cloud.google.com/go/storage"
"github.com/m-lab/etl-gardener/tracker"
"github.com/m-lab/etl/etl"
"github.com/m-lab/etl/metrics"
"github.com/m-lab/go/cloud/gcs"
"github.com/m-lab/go/logx"
@@ -103,17 +105,17 @@ type GCSSource struct {
}

// NewGCSSource creates a new source for active processing.
-func NewGCSSource(ctx context.Context, label string, fl FileLister, toRunnable func(*storage.ObjectAttrs) Runnable) (*GCSSource, error) {
+func NewGCSSource(ctx context.Context, job tracker.Job, fl FileLister, toRunnable func(*storage.ObjectAttrs) Runnable) (*GCSSource, error) {
src := GCSSource{
ctx: WithFail(ctx),
fileLister: fl,
toRunnable: toRunnable,

pendingChan: make(chan Runnable, 0),
-label: label,
+label: job.Path(),
}

-go src.streamToPending(ctx)
+go src.streamToPending(ctx, job)

return &src, nil
}
@@ -162,7 +164,7 @@ func (src *GCSSource) Next(ctx context.Context) (Runnable, error) {
// It fetches the list of files once, then converts files to Runnables until all files are
// handled, or the context is canceled or expires.
// The Runnables are pulled from the queue by Next().
-func (src *GCSSource) streamToPending(ctx context.Context) {
+func (src *GCSSource) streamToPending(ctx context.Context, job tracker.Job) {
// No matter what else happens, we eventually want to close the pendingChan.
defer close(src.pendingChan)

@@ -174,6 +176,10 @@ func (src *GCSSource) streamToPending(ctx context.Context) {
return
}

+index := 0
+dataType := etl.DataType(job.Datatype)
+skipCount := dataType.SkipCount()

for _, f := range files {
debug.Println(f)
if f == nil {
@@ -186,8 +192,12 @@ func (src *GCSSource) streamToPending(ctx context.Context) {
metrics.ActiveErrors.WithLabelValues(src.Label(), "streamToPending").Inc()
break
}
debug.Printf("Adding gs://%s/%s", f.Bucket, f.Name)
// Blocks until consumer reads channel.
src.pendingChan <- src.toRunnable(f)

if index%(skipCount+1) == 0 {
debug.Printf("Adding gs://%s/%s", f.Bucket, f.Name)
// Blocks until consumer reads channel.
src.pendingChan <- src.toRunnable(f)
}
index++
}
}
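With pcap's skip count of 9, the new gate passes indices 0, 10, 20, …, i.e. one archive in ten; any datatype without a configured skip count gets 0, so index % 1 == 0 holds and every file is enqueued. Note that index advances for every listed file, enqueued or not, so the sampling rate holds across the whole listing.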
93 changes: 88 additions & 5 deletions active/active_test.go
@@ -6,6 +6,7 @@ import (
"context"
"log"
"os"
"path"
"sync"
"testing"
"time"
@@ -15,6 +16,7 @@
"golang.org/x/sync/errgroup"
"google.golang.org/api/iterator"

"github.com/m-lab/etl-gardener/tracker"
"github.com/m-lab/etl/active"
"github.com/m-lab/go/cloud/gcs"
"github.com/m-lab/go/logx"
@@ -23,6 +25,10 @@ import (
"github.com/m-lab/go/cloudtest/gcsfake"
)

+var (
+job = tracker.Job{}
+)

func init() {
// Always prepend the filename and line number.
log.SetFlags(log.LstdFlags | log.Lshortfile)
@@ -101,6 +107,31 @@ func standardLister() active.FileLister {
return active.FileListerFunc(bh, "ndt/ndt5/2019/01/01/", nil)
}

+func skipFilesLister(dataType string) active.FileLister {
+client := gcsfake.GCSClient{}
+prefix := path.Join("ndt/", dataType, "/2019/01/01/")
+client.AddTestBucket("foobar",
+&gcsfake.BucketHandle{
+ObjAttrs: []*storage.ObjectAttrs{
+{Bucket: "foobar", Name: path.Join(prefix, "obj1"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj2"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj3"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj4"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj5"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj6"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj7"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj8"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj9"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj10"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj11"), Updated: time.Now()},
+}})

+bh, err := gcs.GetBucket(context.Background(), &client, "foobar")
+rtx.Must(err, "GetBucket failed")
+return active.FileListerFunc(bh, prefix, nil)
+}

func runAll(ctx context.Context, rSrc active.RunnableSource) (*errgroup.Group, error) {
eg := &errgroup.Group{}
for {
@@ -123,7 +154,7 @@ func runAll(ctx context.Context, rSrc active.RunnableSource) (*errgroup.Group, error) {
func TestGCSSourceBasic(t *testing.T) {
p := newCounter(t)
ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -149,7 +180,7 @@ func TestWithRunFailures(t *testing.T) {
p.addOutcome(os.ErrInvalid)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -174,7 +205,7 @@ func TestExpiredContext(t *testing.T) {
func TestExpiredContext(t *testing.T) {
p := newCounter(t)
ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -199,7 +230,7 @@ func TestWithStorageError(t *testing.T) {
p := newCounter(t)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", ErroringLister, p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, ErroringLister, p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -214,7 +245,7 @@ func TestExpiredFileListerContext(t *testing.T) {
p := newCounter(t)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -237,3 +268,55 @@ func TestExpiredFileListerContext(t *testing.T) {
t.Error("Should return os.ErrInvalid", err)
}
}

+func TestSkipFiles(t *testing.T) {
+tests := []struct {
+name string
+successCount int
+failureCount int
+}{
+{
+name: "pcap",
+successCount: 2,
+failureCount: 0,
+},
+{
+name: "ndt7",
+successCount: 11,
+failureCount: 0,
+},
+{
+name: "foo",
+successCount: 11,
+failureCount: 0,
+},
+}

+for _, tt := range tests {
+t.Run(tt.name, func(t *testing.T) {
+p := newCounter(t)
+ctx := context.Background()
+fs, err := active.NewGCSSource(ctx, tracker.Job{Datatype: tt.name}, skipFilesLister(tt.name), p.toRunnable)
+if err != nil {
+t.Fatal(err)
+}

+eg, err := runAll(ctx, fs)
+if err != iterator.Done {
+t.Fatal(err)
+}
+err = eg.Wait()
+if err != nil {
+t.Error(err)
+}

+if p.success != tt.successCount {
+t.Errorf("for %s, %d should have succeeded, got %d", tt.name, tt.successCount, p.success)
+}

+if p.fail != tt.failureCount {
+t.Errorf("for %s, %d should have failed, got %d", tt.name, tt.failureCount, p.fail)
+}
+})
+}
+}
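The expected counts follow from the 11 fake objects built by skipFilesLister: pcap has a skip count of 9, so only indices 0 and 10 satisfy index % 10 == 0, giving 2 successes; ndt7 and the unknown type foo have no entry in the skip-count map, so SkipCount() returns 0 and all 11 objects are processed.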
2 changes: 1 addition & 1 deletion active/poller.go
@@ -154,7 +154,7 @@ func (g *GardenerAPI) JobFileSource(ctx context.Context, job tracker.Job,
return nil, err
}
lister := FileListerFunc(bh, prefix, filter)
-gcsSource, err := NewGCSSource(ctx, job.Path(), lister, toRunnable)
+gcsSource, err := NewGCSSource(ctx, job, lister, toRunnable)
if err != nil {
failMetric(job, "GCSSource")
return nil, err
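The one-line change here completes the plumbing: JobFileSource now hands the full tracker.Job to NewGCSSource, which derives both the metric label (job.Path()) and the per-datatype sampling rate (job.Datatype) itself, instead of receiving a pre-rendered label string.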
12 changes: 12 additions & 0 deletions etl/globals.go
@@ -318,6 +318,13 @@ var (
}
// There is also a mapping of data types to queue names in
// queue_pusher.go

+// Map from data type to number of files to skip when processing said type.
+// It allows us to process fewer archives when there is a very high volume of data.
+// TODO - this should be loaded from a config.
+dataTypeToSkipCount = map[DataType]int{
+PCAP: 9,
+}
)

/*******************************************************************************
@@ -331,6 +338,11 @@ func DirToTablename(dir string) string {
return dataTypeToTable[dirToDataType[dir]]
}

+// SkipCount returns the number of files to skip when processing each DataType.
+func (dt DataType) SkipCount() int {
+return dataTypeToSkipCount[dt]
+}

// BigqueryProject returns the appropriate project.
func (dt DataType) BigqueryProject() string {
project := BigqueryProject
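Because SkipCount() is a bare map lookup, any DataType absent from dataTypeToSkipCount falls back to Go's zero value for int. A short sketch of the resulting behavior, using the DataType constants exercised in globals_test.go below:

package main

import (
	"fmt"

	"github.com/m-lab/etl/etl"
)

func main() {
	fmt.Println(etl.PCAP.SkipCount()) // 9: listed in dataTypeToSkipCount
	fmt.Println(etl.NDT7.SkipCount()) // 0: absent from the map, so the zero value
}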
33 changes: 33 additions & 0 deletions etl/globals_test.go
@@ -368,3 +368,36 @@ func TestDirToTablename(t *testing.T) {
t.Errorf("DirToTablename() failed to translate PT dir name correctly.")
}
}

+func TestSkipCount(t *testing.T) {
+tests := []struct {
+name string
+dataType etl.DataType
+want int
+}{
+{
+name: "ndt7",
+dataType: etl.NDT7,
+want: 0,
+},
+{
+name: "pcap",
+dataType: etl.PCAP,
+want: 9,
+},
+{
+name: "invalid",
+dataType: etl.INVALID,
+want: 0,
+},
+}

+for _, tt := range tests {
+t.Run(tt.name, func(t *testing.T) {
+got := tt.dataType.SkipCount()
+if got != tt.want {
+t.Errorf("SkipCount() = %d, want %d", got, tt.want)
+}
+})
+}
+}
