Skip to content

Commit

Permalink
[preprocessor/wacz] add wacz user agent
Browse files Browse the repository at this point in the history
  • Loading branch information
williamchong committed Jun 13, 2024
1 parent 0849127 commit 0e4f4a2
Show file tree
Hide file tree
Showing 2 changed files with 42 additions and 0 deletions.
1 change: 1 addition & 0 deletions preprocessor/folder/file.go
Original file line number Diff line number Diff line change
Expand Up @@ -102,6 +102,7 @@ func getWaczFileMetadata(filePath string) (map[string]any, error) {
"asset_origin_id": getAssetOriginRoot(filePath),
"media_type": mediaType,
"asset_origin_type": []string{"wacz"},
"crawl_user_agent": metadata.UserAgent,
"wacz": wacz,
}
return waczMetadata, nil
Expand Down
41 changes: 41 additions & 0 deletions preprocessor/wacz/wacz.go
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@ package util

import (
"archive/zip"
"bufio"
"bytes"
"compress/gzip"
"crypto"
"crypto/ecdsa"
"crypto/sha256"
Expand All @@ -14,8 +16,10 @@ import (
"encoding/pem"
"fmt"
"io"
"log"
"math/big"
"slices"
"strings"
"time"

"path/filepath"
Expand Down Expand Up @@ -65,6 +69,7 @@ type waczPackageData struct {
// WaczFileData bundles the information extracted from a WACZ archive. It is
// the value returned by ReadAndVerifyWaczMetadata.
type WaczFileData struct {
// DigestData holds the parsed digest data of the archive.
DigestData *waczDigestData
// PackageData holds the parsed package data of the archive.
PackageData *waczPackageData
// UserAgent is the user agent string found in archive/data.warc.gz. It is
// left empty when the lookup fails, because ReadAndVerifyWaczMetadata only
// logs that error instead of returning it.
UserAgent string
}

// https://github.com/webrecorder/authsign/blob/main/authsign/trusted/roots.yaml
Expand All @@ -79,6 +84,36 @@ var trustedTimestampFingerprints = []string{
"a6379e7cecc05faa3cbf076013d745e327bbbaa38c0b9af22469d4701d18aabc",
}

// findUserAgent returns the value of the first "user-agent" header line found
// in the gzip-compressed WARC stored at archive/data.warc.gz.
//
// fileMap maps the paths inside the WACZ archive to their zip entries.
//
// The header name is matched case-insensitively (HTTP header names are not
// case-sensitive) and only at the beginning of a line, so that the text
// "user-agent:" appearing inside a payload or another header is not mistaken
// for the header itself. The returned value has surrounding whitespace
// removed.
//
// An error is returned when the WARC entry is missing, cannot be opened or
// decompressed, or does not contain a user-agent header.
func findUserAgent(fileMap map[string]*zip.File) (string, error) {
	const (
		warcPath   = "archive/data.warc.gz"
		headerName = "user-agent:"
	)

	entry := fileMap[warcPath]
	if entry == nil {
		return "", fmt.Errorf("missing data.warc.gz")
	}
	file, err := entry.Open()
	if err != nil {
		return "", fmt.Errorf("opening data.warc.gz: %w", err)
	}
	defer file.Close()

	fz, err := gzip.NewReader(file)
	if err != nil {
		return "", fmt.Errorf("decompressing data.warc.gz: %w", err)
	}
	defer fz.Close()

	// bufio.Reader.ReadLine is used instead of bufio.Scanner: WARC payloads
	// are frequently binary and may contain "lines" far longer than the
	// scanner's 64 KiB token limit, which would abort the scan with
	// bufio.ErrTooLong before the header is ever reached.
	reader := bufio.NewReader(fz)
	midLine := false // true while consuming the remainder of an over-long line
	for {
		chunk, isPrefix, err := reader.ReadLine()
		if err == io.EOF {
			break
		}
		if err != nil {
			return "", fmt.Errorf("reading data.warc.gz: %w", err)
		}

		atLineStart := !midLine
		midLine = isPrefix
		if !atLineStart {
			continue
		}
		if len(chunk) < len(headerName) || !strings.EqualFold(string(chunk[:len(headerName)]), headerName) {
			continue
		}

		// Copy the value out before reading again; chunk is only valid until
		// the next ReadLine call.
		value := string(chunk[len(headerName):])
		for isPrefix {
			chunk, isPrefix, err = reader.ReadLine()
			if err == io.EOF {
				break
			}
			if err != nil {
				return "", fmt.Errorf("reading data.warc.gz: %w", err)
			}
			value += string(chunk)
		}
		return strings.TrimSpace(value), nil
	}
	return "", fmt.Errorf("user-agent not found")
}

// verifyFileHashes verifies the hash of files listed in the package data.
func verifyFileHashes(packageData *waczPackageData, fileMap map[string]*zip.File) error {
for _, resource := range packageData.Resources {
Expand Down Expand Up @@ -399,8 +434,14 @@ func ReadAndVerifyWaczMetadata(filePath string) (*WaczFileData, error) {
return nil, fmt.Errorf("signature verification failed")
}

userAgent, err := findUserAgent(fileMap)
if err != nil {
log.Printf("failed to find user agent in data.warc.gz: %v", err)
}

return &WaczFileData{
DigestData: &digestData,
PackageData: &packageData,
UserAgent: userAgent,
}, nil
}

0 comments on commit 0e4f4a2

Please sign in to comment.