-
Notifications
You must be signed in to change notification settings - Fork 7
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Add parser for pcap index * Replace regex with filepath.Base to extract the UUID
- Loading branch information
1 parent
a8f5d84
commit c49fd6b
Showing
8 changed files
with
286 additions
and
1 deletion.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
package parser | ||
|
||
import (
	"fmt"
	"path/filepath"
	"strings"
	"time"

	"cloud.google.com/go/bigquery"
	"cloud.google.com/go/civil"
	v2as "github.com/m-lab/annotation-service/api/v2"
	"github.com/m-lab/etl/etl"
	"github.com/m-lab/etl/metrics"
	"github.com/m-lab/etl/row"
	"github.com/m-lab/etl/schema"
)
|
||
//===================================================================================== | ||
// PCAP Parser | ||
//===================================================================================== | ||
|
||
// pcapSuffix is the filename ending that marks a file as a pcap archive
// member. IsParsable tests for it and GetUUID strips it.
const (
	pcapSuffix = ".pcap.gz"
)
|
||
// PCAPParser parses the PCAP datatype from the packet-headers process. | ||
type PCAPParser struct { | ||
*row.Base | ||
table string | ||
suffix string | ||
} | ||
|
||
// NewPCAPParser returns a new parser for PCAP archives. | ||
func NewPCAPParser(sink row.Sink, table, suffix string, ann v2as.Annotator) etl.Parser { | ||
bufSize := etl.PCAP.BQBufferSize() | ||
if ann == nil { | ||
ann = v2as.GetAnnotator(etl.BatchAnnotatorURL) | ||
} | ||
|
||
return &PCAPParser{ | ||
Base: row.NewBase(table, sink, bufSize, ann), | ||
table: table, | ||
suffix: suffix, | ||
} | ||
|
||
} | ||
|
||
// IsParsable returns the canonical test type and whether to parse data. | ||
func (p *PCAPParser) IsParsable(testName string, data []byte) (string, bool) { | ||
// Files look like (.*).pcap.gz . | ||
if strings.HasSuffix(testName, pcapSuffix) { | ||
return "pcap", true | ||
} | ||
return "", false | ||
} | ||
|
||
// ParseAndInsert decodes the PCAP data and inserts it into BQ. | ||
func (p *PCAPParser) ParseAndInsert(fileMetadata map[string]bigquery.Value, testName string, rawContent []byte) error { | ||
metrics.WorkerState.WithLabelValues(p.TableName(), "pcap").Inc() | ||
defer metrics.WorkerState.WithLabelValues(p.TableName(), "pcap").Dec() | ||
|
||
row := schema.PCAPRow{ | ||
Parser: schema.ParseInfo{ | ||
Version: Version(), | ||
Time: time.Now(), | ||
ArchiveURL: fileMetadata["filename"].(string), | ||
Filename: testName, | ||
GitCommit: GitCommit(), | ||
}, | ||
} | ||
|
||
// NOTE: Civil is not TZ adjusted. It takes the year, month, and date from | ||
// the given timestamp, regardless of the timestamp's timezone. Since we | ||
// run our systems in UTC, all timestamps will be relative to UTC and as | ||
// will these dates. | ||
row.Date = fileMetadata["date"].(civil.Date) | ||
row.ID = p.GetUUID(testName) | ||
|
||
// Insert the row. | ||
if err := p.Put(&row); err != nil { | ||
return err | ||
} | ||
|
||
// Count successful inserts. | ||
metrics.TestCount.WithLabelValues(p.TableName(), "pcap", "ok").Inc() | ||
|
||
return nil | ||
} | ||
|
||
// GetUUID extracts the UUID from the filename. | ||
// For example, for filename 2021/07/22/ndt-4c6fb_1625899199_00000000013A4623.pcap.gz, | ||
// it returns ndt-4c6fb_1625899199_00000000013A4623. | ||
func (p *PCAPParser) GetUUID(filename string) string { | ||
id := filepath.Base(filename) | ||
return strings.TrimSuffix(id, pcapSuffix) | ||
} | ||
|
||
// NB: These functions are also required to complete the etl.Parser interface.
// For PCAP, Flush forwards to the embedded row.Base, and the row counters are
// read from its stats.
|
||
func (p *PCAPParser) Flush() error { | ||
return p.Base.Flush() | ||
} | ||
|
||
func (p *PCAPParser) TableName() string { | ||
return p.table | ||
} | ||
|
||
func (p *PCAPParser) FullTableName() string { | ||
return p.table + p.suffix | ||
} | ||
|
||
// RowsInBuffer returns the count of rows currently in the buffer. | ||
func (p *PCAPParser) RowsInBuffer() int { | ||
return p.GetStats().Pending | ||
} | ||
|
||
// Committed returns the count of rows successfully committed to BQ. | ||
func (p *PCAPParser) Committed() int { | ||
return p.GetStats().Committed | ||
} | ||
|
||
// Accepted returns the count of all rows received through InsertRow(s). | ||
func (p *PCAPParser) Accepted() int { | ||
return p.GetStats().Total() | ||
} | ||
|
||
// Failed returns the count of all rows that could not be committed. | ||
func (p *PCAPParser) Failed() int { | ||
return p.GetStats().Failed | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
package parser_test | ||
|
||
import ( | ||
"io/ioutil" | ||
"path" | ||
"strings" | ||
"testing" | ||
|
||
"cloud.google.com/go/bigquery" | ||
"cloud.google.com/go/civil" | ||
"github.com/go-test/deep" | ||
"github.com/m-lab/etl/parser" | ||
"github.com/m-lab/etl/schema" | ||
"github.com/m-lab/go/rtx" | ||
) | ||
|
||
// pcapFilename is the pcap fixture file used by these tests.
const pcapFilename = "ndt-4c6fb_1625899199_000000000121C1A0.pcap.gz"

// pcapGCSPath is the archive location prefix used to build the fixture's
// archive URL; it ends with a slash.
const pcapGCSPath = "gs://archive-measurement-lab/ndt/pcap/2021/07/22/"
|
||
func TestPCAPParser_ParseAndInsert(t *testing.T) { | ||
ins := newInMemorySink() | ||
n := parser.NewPCAPParser(ins, "test", "_suffix", &fakeAnnotator{}) | ||
|
||
data, err := ioutil.ReadFile(path.Join("testdata/PCAP/", pcapFilename)) | ||
rtx.Must(err, "failed to load test file") | ||
|
||
date := civil.Date{Year: 2021, Month: 07, Day: 22} | ||
|
||
meta := map[string]bigquery.Value{ | ||
"filename": path.Join(pcapGCSPath, pcapFilename), | ||
"date": date, | ||
} | ||
|
||
if err := n.ParseAndInsert(meta, pcapFilename, data); err != nil { | ||
t.Errorf("PCAPParser.ParseAndInsert() error = %v, wantErr %v", err, true) | ||
} | ||
|
||
if n.Accepted() != 1 { | ||
t.Fatal("Failed to insert snaplog data", ins) | ||
} | ||
n.Flush() | ||
|
||
row := ins.data[0].(*schema.PCAPRow) | ||
|
||
expectedParseInfo := schema.ParseInfo{ | ||
Version: "https://github.com/m-lab/etl/tree/foobar", | ||
Time: row.Parser.Time, | ||
ArchiveURL: path.Join(pcapGCSPath, pcapFilename), | ||
Filename: pcapFilename, | ||
Priority: 0, | ||
GitCommit: "12345678", | ||
} | ||
|
||
expectedPCAPRow := schema.PCAPRow{ | ||
ID: "ndt-4c6fb_1625899199_000000000121C1A0", | ||
Parser: expectedParseInfo, | ||
Date: date, | ||
} | ||
|
||
if diff := deep.Equal(row, &expectedPCAPRow); diff != nil { | ||
t.Errorf("PCAPParser.ParseAndInsert() different row: %s", strings.Join(diff, "\n")) | ||
} | ||
|
||
} | ||
|
||
func TestPCAPParser_IsParsable(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
testName string | ||
want bool | ||
}{ | ||
{ | ||
name: "success-pcap", | ||
testName: pcapFilename, | ||
want: true, | ||
}, | ||
{ | ||
name: "error-bad-extension", | ||
testName: "badfile.badextension", | ||
want: false, | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
data, err := ioutil.ReadFile(path.Join(`testdata/PCAP/`, tt.testName)) | ||
if err != nil { | ||
t.Fatalf(err.Error()) | ||
} | ||
p := &parser.PCAPParser{} | ||
_, got := p.IsParsable(tt.testName, data) | ||
if got != tt.want { | ||
t.Errorf("PCAPParser.IsParsable() got = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} | ||
|
||
func TestPCAPParser_GetUUID(t *testing.T) { | ||
tests := []struct { | ||
name string | ||
filename string | ||
want string | ||
}{ | ||
{ | ||
name: "filename-expected-format", | ||
filename: "2021/07/22/ndt-4c6fb_1625899199_00000000013A4623.pcap.gz", | ||
want: "ndt-4c6fb_1625899199_00000000013A4623", | ||
}, | ||
{ | ||
name: "filename-without-date-prefix", | ||
filename: "ndt-4c6fb_1625899199_00000000013A4623.pcap.gz", | ||
want: "ndt-4c6fb_1625899199_00000000013A4623", | ||
}, | ||
{ | ||
name: "empty-string", | ||
filename: "", | ||
want: ".", | ||
}, | ||
} | ||
|
||
for _, tt := range tests { | ||
t.Run(tt.name, func(t *testing.T) { | ||
p := &parser.PCAPParser{} | ||
got := p.GetUUID(tt.filename) | ||
if got != tt.want { | ||
t.Errorf("PCAPParser.GetUUID() got = %v, want %v", got, tt.want) | ||
} | ||
}) | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
baddata |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters