Update pcap parser processing rate to 1 in 10 archives (#1014)
* Update pcap parser processing rate to 1 in 10 archives

* Use etl.DataType to cast type names

* Add DataType under test to active_test path

* Use path.Join() instead of string concatenation

* Change variable name to 'prefix'
cristinaleonr authored Aug 13, 2021
1 parent c7c6ced commit 81507c8
Showing 5 changed files with 151 additions and 13 deletions.
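The mechanism behind the 1-in-10 rate is a modulo gate: each DataType maps to a skip count, and an archive at position index in the GCS listing is enqueued only when index % (skipCount+1) == 0. A minimal, self-contained sketch of the rule — the skip count of 9 comes from this commit's dataTypeToSkipCount map, while the archive names are illustrative:

package main

import "fmt"

func main() {
	// pcap's configured skip count; every other data type defaults to 0.
	skipCount := 9

	files := make([]string, 21)
	for i := range files {
		files[i] = fmt.Sprintf("archive-%02d.tgz", i)
	}

	for index, f := range files {
		// Enqueue only every (skipCount+1)-th archive: indices 0, 10, 20, ...
		if index%(skipCount+1) == 0 {
			fmt.Println("processing", f)
		}
	}
}

Run against 21 listed archives, this prints indices 0, 10, and 20 — a 1-in-10 sampling rate.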
24 changes: 17 additions & 7 deletions active/active.go
@@ -20,6 +20,8 @@ import (
"google.golang.org/api/iterator"

"cloud.google.com/go/storage"
"github.com/m-lab/etl-gardener/tracker"
"github.com/m-lab/etl/etl"
"github.com/m-lab/etl/metrics"
"github.com/m-lab/go/cloud/gcs"
"github.com/m-lab/go/logx"
@@ -103,17 +105,17 @@ type GCSSource struct {
}

// NewGCSSource creates a new source for active processing.
-func NewGCSSource(ctx context.Context, label string, fl FileLister, toRunnable func(*storage.ObjectAttrs) Runnable) (*GCSSource, error) {
+func NewGCSSource(ctx context.Context, job tracker.Job, fl FileLister, toRunnable func(*storage.ObjectAttrs) Runnable) (*GCSSource, error) {
src := GCSSource{
ctx: WithFail(ctx),
fileLister: fl,
toRunnable: toRunnable,

pendingChan: make(chan Runnable, 0),
-label: label,
+label: job.Path(),
}

-go src.streamToPending(ctx)
+go src.streamToPending(ctx, job)

return &src, nil
}
@@ -162,7 +164,7 @@ func (src *GCSSource) Next(ctx context.Context) (Runnable, error) {
// It fetches the list of files once, then converts files to Runnables until all files are
// handled, or the context is canceled or expires.
// The Runnables are pulled from the queue by Next().
-func (src *GCSSource) streamToPending(ctx context.Context) {
+func (src *GCSSource) streamToPending(ctx context.Context, job tracker.Job) {
// No matter what else happens, we eventually want to close the pendingChan.
defer close(src.pendingChan)

@@ -174,6 +176,10 @@ func (src *GCSSource) streamToPending(ctx context.Context) {
return
}

+index := 0
+dataType := etl.DataType(job.Datatype)
+skipCount := dataType.SkipCount()

for _, f := range files {
debug.Println(f)
if f == nil {
@@ -186,8 +192,12 @@ func (src *GCSSource) streamToPending(ctx context.Context) {
metrics.ActiveErrors.WithLabelValues(src.Label(), "streamToPending").Inc()
break
}
debug.Printf("Adding gs://%s/%s", f.Bucket, f.Name)
// Blocks until consumer reads channel.
src.pendingChan <- src.toRunnable(f)

if index%(skipCount+1) == 0 {
debug.Printf("Adding gs://%s/%s", f.Bucket, f.Name)
// Blocks until consumer reads channel.
src.pendingChan <- src.toRunnable(f)
}
index++
}
}
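With pcap's skip count of 9, the new gate passes indices 0, 10, 20, …, i.e. one archive in ten; any datatype without a configured skip count gets 0, so index % 1 == 0 holds and every file is enqueued. Note that index advances for every listed file, enqueued or not, so the sampling rate holds across the whole listing.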
93 changes: 88 additions & 5 deletions active/active_test.go
@@ -6,6 +6,7 @@ import (
"context"
"log"
"os"
"path"
"sync"
"testing"
"time"
@@ -15,6 +16,7 @@
"golang.org/x/sync/errgroup"
"google.golang.org/api/iterator"

"github.com/m-lab/etl-gardener/tracker"
"github.com/m-lab/etl/active"
"github.com/m-lab/go/cloud/gcs"
"github.com/m-lab/go/logx"
@@ -23,6 +25,10 @@ import (
"github.com/m-lab/go/cloudtest/gcsfake"
)

+var (
+job = tracker.Job{}
+)

func init() {
// Always prepend the filename and line number.
log.SetFlags(log.LstdFlags | log.Lshortfile)
@@ -101,6 +107,31 @@ func standardLister() active.FileLister {
return active.FileListerFunc(bh, "ndt/ndt5/2019/01/01/", nil)
}

+func skipFilesLister(dataType string) active.FileLister {
+client := gcsfake.GCSClient{}
+prefix := path.Join("ndt/", dataType, "/2019/01/01/")
+client.AddTestBucket("foobar",
+&gcsfake.BucketHandle{
+ObjAttrs: []*storage.ObjectAttrs{
+{Bucket: "foobar", Name: path.Join(prefix, "obj1"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj2"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj3"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj4"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj5"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj6"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj7"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj8"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj9"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj10"), Updated: time.Now()},
+{Bucket: "foobar", Name: path.Join(prefix, "obj11"), Updated: time.Now()},
+}})

+bh, err := gcs.GetBucket(context.Background(), &client, "foobar")
+rtx.Must(err, "GetBucket failed")
+return active.FileListerFunc(bh, prefix, nil)
+}

func runAll(ctx context.Context, rSrc active.RunnableSource) (*errgroup.Group, error) {
eg := &errgroup.Group{}
for {
@@ -123,7 +154,7 @@ func runAll(ctx context.Context, rSrc active.RunnableSource) (*errgroup.Group, error) {
func TestGCSSourceBasic(t *testing.T) {
p := newCounter(t)
ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -149,7 +180,7 @@ func TestWithRunFailures(t *testing.T) {
p.addOutcome(os.ErrInvalid)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -174,7 +205,7 @@ func TestExpiredContext(t *testing.T) {
func TestExpiredContext(t *testing.T) {
p := newCounter(t)
ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -199,7 +230,7 @@ func TestWithStorageError(t *testing.T) {
p := newCounter(t)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", ErroringLister, p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, ErroringLister, p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -214,7 +245,7 @@ func TestExpiredFileListerContext(t *testing.T) {
p := newCounter(t)

ctx := context.Background()
fs, err := active.NewGCSSource(ctx, "test", standardLister(), p.toRunnable)
fs, err := active.NewGCSSource(ctx, job, standardLister(), p.toRunnable)
if err != nil {
t.Fatal(err)
}
@@ -237,3 +268,55 @@ func TestExpiredFileListerContext(t *testing.T) {
t.Error("Should return os.ErrInvalid", err)
}
}

+func TestSkipFiles(t *testing.T) {
+tests := []struct {
+name string
+successCount int
+failureCount int
+}{
+{
+name: "pcap",
+successCount: 2,
+failureCount: 0,
+},
+{
+name: "ndt7",
+successCount: 11,
+failureCount: 0,
+},
+{
+name: "foo",
+successCount: 11,
+failureCount: 0,
+},
+}

+for _, tt := range tests {
+t.Run(tt.name, func(t *testing.T) {
+p := newCounter(t)
+ctx := context.Background()
+fs, err := active.NewGCSSource(ctx, tracker.Job{Datatype: tt.name}, skipFilesLister(tt.name), p.toRunnable)
+if err != nil {
+t.Fatal(err)
+}

+eg, err := runAll(ctx, fs)
+if err != iterator.Done {
+t.Fatal(err)
+}
+err = eg.Wait()
+if err != nil {
+t.Error(err)
+}

+if p.success != tt.successCount {
+t.Errorf("for %s, %d should have succeeded, got %d", tt.name, tt.successCount, p.success)
+}

+if p.fail != tt.failureCount {
+t.Errorf("for %s, %d should have failed, got %d", tt.name, tt.failureCount, p.fail)
+}
+})
+}
+}
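The expected counts follow from the 11 fake objects built by skipFilesLister: pcap has a skip count of 9, so only indices 0 and 10 satisfy index % 10 == 0, giving 2 successes; ndt7 and the unknown type foo have no entry in the skip-count map, so SkipCount() returns 0 and all 11 objects are processed.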
2 changes: 1 addition & 1 deletion active/poller.go
@@ -154,7 +154,7 @@ func (g *GardenerAPI) JobFileSource(ctx context.Context, job tracker.Job,
return nil, err
}
lister := FileListerFunc(bh, prefix, filter)
-gcsSource, err := NewGCSSource(ctx, job.Path(), lister, toRunnable)
+gcsSource, err := NewGCSSource(ctx, job, lister, toRunnable)
if err != nil {
failMetric(job, "GCSSource")
return nil, err
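The one-line change here completes the plumbing: JobFileSource now hands the full tracker.Job to NewGCSSource, which derives both the metric label (job.Path()) and the per-datatype sampling rate (job.Datatype) itself, instead of receiving a pre-rendered label string.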
12 changes: 12 additions & 0 deletions etl/globals.go
@@ -318,6 +318,13 @@ var (
}
// There is also a mapping of data types to queue names in
// queue_pusher.go

+// Map from data type to number of files to skip when processing said type.
+// It allows us to process fewer archives when there is a very high volume of data.
+// TODO - this should be loaded from a config.
+dataTypeToSkipCount = map[DataType]int{
+PCAP: 9,
+}
)

/*******************************************************************************
@@ -331,6 +338,11 @@ func DirToTablename(dir string) string {
return dataTypeToTable[dirToDataType[dir]]
}

+// SkipCount returns the number of files to skip when processing each DataType.
+func (dt DataType) SkipCount() int {
+return dataTypeToSkipCount[dt]
+}

// BigqueryProject returns the appropriate project.
func (dt DataType) BigqueryProject() string {
project := BigqueryProject
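Because SkipCount() is a bare map lookup, any DataType absent from dataTypeToSkipCount falls back to Go's zero value for int. A short sketch of the resulting behavior, using the DataType constants exercised in globals_test.go below:

package main

import (
	"fmt"

	"github.com/m-lab/etl/etl"
)

func main() {
	fmt.Println(etl.PCAP.SkipCount()) // 9: listed in dataTypeToSkipCount
	fmt.Println(etl.NDT7.SkipCount()) // 0: absent from the map, so the zero value
}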
33 changes: 33 additions & 0 deletions etl/globals_test.go
@@ -368,3 +368,36 @@ func TestDirToTablename(t *testing.T) {
t.Errorf("DirToTablename() failed to translate PT dir name correctly.")
}
}

+func TestSkipCount(t *testing.T) {
+tests := []struct {
+name string
+dataType etl.DataType
+want int
+}{
+{
+name: "ndt7",
+dataType: etl.NDT7,
+want: 0,
+},
+{
+name: "pcap",
+dataType: etl.PCAP,
+want: 9,
+},
+{
+name: "invalid",
+dataType: etl.INVALID,
+want: 0,
+},
+}

+for _, tt := range tests {
+t.Run(tt.name, func(t *testing.T) {
+got := tt.dataType.SkipCount()
+if got != tt.want {
+t.Errorf("SkipCount() = %d, want %d", got, tt.want)
+}
+})
+}
+}
