Skip to content

Commit

Permalink
[preprocessor/wacz] rewrite user agent detection to support any warc file
Browse files: browse the repository at this point in the history
  • Loading branch information
williamchong committed Jun 16, 2024
1 parent 0e4f4a2 commit b09bec5
Showing 1 changed file with 34 additions and 19 deletions.
53 changes: 34 additions & 19 deletions preprocessor/wacz/wacz.go
Original file line number Diff line number Diff line change
Expand Up @@ -85,31 +85,47 @@ var trustedTimestampFingerprints = []string{
}

// findUserAgent finds the user agent string in the data.warc.gz file.
func findUserAgent(fileMap map[string]*zip.File) (string, error) {
if fileMap["archive/data.warc.gz"] == nil {
return "", fmt.Errorf("missing data.warc.gz")
func findUserAgent(packageData waczPackageData, fileMap map[string]*zip.File) (string, error) {
var targetFile string
for _, resource := range packageData.Resources {
if strings.HasPrefix(resource.Path, "archive/") && (strings.HasSuffix(resource.Path, ".warc") || strings.HasSuffix(resource.Path, ".warc.gz")) {
targetFile = resource.Path
break
}
}
file, err := fileMap["archive/data.warc.gz"].Open()
if err != nil {
return "", err

if targetFile == "" || fileMap[targetFile] == nil {
return "", fmt.Errorf("missing warc files")
}
defer file.Close()

fz, err := gzip.NewReader(file)
file, err := fileMap[targetFile].Open()
if err != nil {
return "", err
}
defer fz.Close()
defer file.Close()

scanner := bufio.NewScanner(fz)
for scanner.Scan() {
text := scanner.Text()
if strings.Contains(text, "user-agent: ") {
return strings.TrimPrefix(text, "user-agent: "), nil
if strings.HasSuffix(targetFile, ".gz") {
file, err = gzip.NewReader(file)
if err != nil {
return "", err
}
defer file.Close()
}
if err := scanner.Err(); err != nil {
return "", err

reader := bufio.NewReader(file)
for {
line, err := reader.ReadString('\n')
if err != nil {
if err == io.EOF {
break
}
if err != bufio.ErrBufferFull {
return "", err
}
}
if strings.Contains(line, "user-agent: ") || strings.Contains(line, "User-Agent: ") {
return line[strings.Index(line, ":")+2:], nil
}
}
return "", fmt.Errorf("user-agent not found")
}
Expand Down Expand Up @@ -291,13 +307,12 @@ func verifyDomainSignature(
return false, err
}

if timestampCert.NotBefore.Before(signatureCreated) || timestampCert.NotAfter.After(signatureCreated) {
if signatureCreated.Before(timestampCert.NotBefore) || signatureCreated.After(timestampCert.NotAfter) {
return false, fmt.Errorf("timestamp cert not valid at creation time")
}

if signatureCreated.Sub(*signTime).Abs() > 10*time.Minute {
return false, fmt.Errorf("timestamp too far from signature creation time")

}

return true, nil
Expand Down Expand Up @@ -434,7 +449,7 @@ func ReadAndVerifyWaczMetadata(filePath string) (*WaczFileData, error) {
return nil, fmt.Errorf("signature verification failed")
}

userAgent, err := findUserAgent(fileMap)
userAgent, err := findUserAgent(packageData, fileMap)
if err != nil {
log.Printf("failed to find user agent in data.warc.gz: %v", err)
}
Expand Down

0 comments on commit b09bec5

Please sign in to comment.