web-archiver.go
// Command web-archiver walks a sitemap and submits pages from it to the
// Internet Archive's Wayback Machine, optionally filtered by last-modification date.
package main

import (
	"bytes"
	"encoding/xml"
	"fmt"
	"io"
	"net/http"
	"net/url"
	"os"
	"sync"
)
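
// URL is a single <url> entry in a sitemap: the page location plus its
// optional <lastmod> date.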
type URL struct {
	Loc     string `xml:"loc"`
	Lastmod string `xml:"lastmod"`
}
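
// URLSet is the root <urlset> element of a sitemap.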
type URLSet struct {
	XMLName xml.Name `xml:"urlset"`
	URLs    []URL    `xml:"url"`
}
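
// fetchSitemap downloads the sitemap at the given URL and returns the raw
// response body, or an error for network failures and non-200 statuses.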
func fetchSitemap(url string) ([]byte, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("error fetching sitemap: %s", resp.Status)
	}
	return io.ReadAll(resp.Body)
}
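
// parseSitemap unmarshals sitemap XML into a URLSet.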
func parseSitemap(data []byte) (*URLSet, error) {
	var urlset URLSet
	err := xml.Unmarshal(data, &urlset)
	if err != nil {
		return nil, err
	}
	return &urlset, nil
}
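
// saveToWebArchive submits a URL to the Wayback Machine's "Save Page Now"
// endpoint, asking it to also capture outlinks and a screenshot. The request
// is authenticated with the S3-style API keys taken from the
// WAYBACK_S3_ACCESS_KEY and WAYBACK_S3_SECRET_KEY environment variables, and
// the response is printed for inspection.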
func saveToWebArchive(urlToSave string) error {
	apiURL := "https://web.archive.org/save/"

	data := url.Values{}
	data.Set("url", urlToSave)
	data.Set("capture_all", "on")
	data.Set("capture_outlinks", "on")
	data.Set("capture_screenshot", "on")

	req, err := http.NewRequest("POST", apiURL, bytes.NewBufferString(data.Encode()))
	if err != nil {
		return err
	}
	req.Header.Set("Content-Type", "application/x-www-form-urlencoded")
	req.Header.Set("Accept", "application/json")
	req.Header.Set("Authorization", fmt.Sprintf("LOW %s:%s", os.Getenv("WAYBACK_S3_ACCESS_KEY"), os.Getenv("WAYBACK_S3_SECRET_KEY")))

	client := &http.Client{}
	resp, err := client.Do(req)
	if err != nil {
		return err
	}
	defer resp.Body.Close()

	bodyResponse, err := io.ReadAll(resp.Body)
	if err != nil {
		return err
	}
	fmt.Println(urlToSave)
	fmt.Println("Web Archive Response Status:", resp.Status)
	fmt.Println("Web Archive Response:", string(bodyResponse))
	if resp.StatusCode != http.StatusOK {
		return fmt.Errorf("unexpected status from Web Archive: %s", resp.Status)
	}
	return nil
}
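
// exitWithError prints a message and exits with a non-zero status code.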
func exitWithError(message string) {
	fmt.Println(message)
	os.Exit(1)
}
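
// main fetches the sitemap given as the first argument and archives its URLs
// concurrently. An optional second argument is a cutoff date; it is compared
// lexicographically against each <lastmod> value, so it should use the same
// format as the sitemap (typically YYYY-MM-DD).
//
// Usage:
//
//	go run web-archiver.go <sitemap-url> [cutoff-date]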
func main() {
	if len(os.Args) < 2 {
		exitWithError("A sitemap URL as first argument is required")
	}
	date := ""
	if len(os.Args) == 3 {
		date = os.Args[2]
	}
	sitemapURL := os.Args[1]

	data, err := fetchSitemap(sitemapURL)
	if err != nil {
		exitWithError(fmt.Sprintf("Error fetching sitemap: %v", err))
	}
	urlset, err := parseSitemap(data)
	if err != nil {
		exitWithError(fmt.Sprintf("Error parsing sitemap: %v", err))
	}

	var wg sync.WaitGroup
	for _, u := range urlset.URLs {
		fmt.Println(u)
		// Skip entries whose lastmod is older than the cutoff date (or any
		// entry carrying a lastmod when no cutoff date was given).
		if u.Lastmod != "" && (date == "" || date > u.Lastmod) {
			fmt.Printf("Skipping %q\n", u.Loc)
			continue
		}
		wg.Add(1)
		go func(u URL) {
			defer wg.Done()
			// TODO: Add archive.today saving
			if err := saveToWebArchive(u.Loc); err != nil {
				fmt.Printf("Error archiving %q: %v\n", u.Loc, err)
			}
		}(u)
	}
	wg.Wait()
}