forked from CorentinB/warc
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dedupe.go
80 lines (68 loc) · 1.58 KB
/
dedupe.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
package warc
import (
	"fmt"
	"io"
	"net"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"
)
// CDXHTTPClient is the HTTP client checkCDXRevisit uses to query a CDX server.
//
// A whole request (connect, redirects, reading the body) is limited to 10
// seconds; establishing the TCP connection and completing the TLS handshake
// are each limited to 5 seconds.
var CDXHTTPClient = http.Client{
	Timeout: 10 * time.Second,
	Transport: &http.Transport{
		// DialContext is used instead of the deprecated Dial field so that an
		// in-flight dial is abandoned as soon as the request's context ends
		// (which includes the client Timeout above firing).
		DialContext: (&net.Dialer{
			Timeout: 5 * time.Second,
		}).DialContext,
		TLSHandshakeTimeout: 5 * time.Second,
	},
}
// DedupeOptions groups the settings that control record deduplication.
//
// NOTE(review): none of these fields is read in this part of the file; the
// per-field notes are inferred from the field names and from the parameters
// of checkCDXRevisit, and should be confirmed against the callers.
type DedupeOptions struct {
	// CDXURL is presumably the CDX server base URL given to checkCDXRevisit.
	CDXURL string
	// CDXCookie is presumably the Cookie header value sent with CDX lookups.
	CDXCookie string
	// SizeThreshold is presumably a size cut-off applied before deduplicating
	// — TODO confirm the unit and the direction of the comparison.
	SizeThreshold int
	// LocalDedupe presumably enables lookups in the in-memory digest table.
	LocalDedupe bool
	// CDXDedupe presumably enables lookups against the CDX server.
	CDXDedupe bool
}
// revisitRecord describes a previously captured response that a new capture
// may refer to instead of storing the same payload again. Its zero value is
// what checkLocalRevisit and checkCDXRevisit return when nothing matched.
type revisitRecord struct {
	// responseUUID identifies the earlier response record; checkCDXRevisit
	// always leaves it empty.
	responseUUID string
	// targetURI is the URI of the earlier capture.
	targetURI string
	// date is the earlier capture's timestamp, kept as the string the source
	// supplied it in.
	date string
	// size is the earlier capture's length; checkCDXRevisit leaves it at 0
	// when the CDX length field is not a valid integer.
	size int
}
// checkLocalRevisit looks digest up in the client's dedupeHashTable and
// returns the revisitRecord stored under it. When the table has no entry for
// digest, the zero revisitRecord is returned.
//
// The stored value is asserted to be a revisitRecord without a check, so an
// entry of any other type makes this method panic.
// NOTE(review): dedupeHashTable is declared outside this view; its Load
// method looks like sync.Map's — confirm.
func (d *customDialer) checkLocalRevisit(digest string) revisitRecord {
	stored, found := d.client.dedupeHashTable.Load(digest)
	if !found {
		return revisitRecord{}
	}

	return stored.(revisitRecord)
}
// checkCDXRevisit queries the CDX server at CDXURL for targetURI and reports
// the returned capture as a revisit candidate when its digest equals digest.
//
// The lookup is a GET on CDXURL + "/web/timemap/cdx" with the query-escaped
// targetURI and limit=-1, sent through CDXHTTPClient. When cookie is not
// empty it is sent verbatim as the Cookie header.
// NOTE(review): limit=-1 is understood to ask for the most recent capture
// only (Wayback CDX server API) — confirm for the server in use.
//
// Returns:
//   - a populated revisitRecord (responseUUID always empty) and a nil error
//     when the first line of the reply has at least seven fields, is not
//     itself a "warc/revisit" entry, and carries the requested digest;
//   - the zero revisitRecord and a nil error when the reply does not match;
//   - the zero revisitRecord and a non-nil error when the request cannot be
//     built or sent, the server answers with a status other than 200, or the
//     body cannot be read.
func checkCDXRevisit(CDXURL string, digest string, targetURI string, cookie string) (revisitRecord, error) {
	// Only the leading fields of the reply are inspected, so the amount read
	// from the server is capped rather than trusting it to send a short body.
	const maxReplyBytes = 1 << 20

	// Positions of the whitespace-separated CDX fields used below.
	const (
		minFields      = 7
		timestampField = 1
		originalField  = 2
		mimeTypeField  = 3
		digestField    = 5
		lengthField    = 6
	)

	req, err := http.NewRequest(http.MethodGet, CDXURL+"/web/timemap/cdx?url="+url.QueryEscape(targetURI)+"&limit=-1", nil)
	if err != nil {
		return revisitRecord{}, err
	}

	if cookie != "" {
		req.Header.Add("Cookie", cookie)
	}

	resp, err := CDXHTTPClient.Do(req)
	if err != nil {
		return revisitRecord{}, err
	}
	defer resp.Body.Close()

	// An error page is not CDX data; report it instead of trying to parse it.
	if resp.StatusCode != http.StatusOK {
		return revisitRecord{}, fmt.Errorf("cdx lookup for %q: unexpected status %s", targetURI, resp.Status)
	}

	body, err := io.ReadAll(io.LimitReader(resp.Body, maxReplyBytes))
	if err != nil {
		return revisitRecord{}, err
	}

	fields := strings.Fields(string(body))
	if len(fields) < minFields {
		return revisitRecord{}, nil
	}

	// A capture that is itself a revisit holds no payload to point at, and a
	// different digest means the content has changed since that capture.
	if fields[mimeTypeField] == "warc/revisit" || fields[digestField] != digest {
		return revisitRecord{}, nil
	}

	// Best effort: a length that is not a valid integer leaves size at 0
	// instead of discarding an otherwise valid match.
	recordSize, _ := strconv.Atoi(fields[lengthField])

	return revisitRecord{
		responseUUID: "",
		size:         recordSize,
		targetURI:    fields[originalField],
		date:         fields[timestampField],
	}, nil
}