-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathaudubon.go
135 lines (120 loc) · 4.47 KB
/
audubon.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
package bird_data_guessing
import (
"fmt"
"regexp"
"strings"
"sync"
"github.com/gbdubs/amass"
"github.com/gbdubs/attributions"
"github.com/gbdubs/bird"
"github.com/gbdubs/sitemaps"
)
// audubonResponse wraps one fetched amass.GetResponse for the audubon site so
// that audubon-specific parsing (see propertySearchers) can hang off it.
type audubonResponse struct {
// Response is the underlying fetched response, held by value (a copy of the
// response handed to reconstructAudubonResponsesKeyedByEnglishName).
Response amass.GetResponse
}
// audubonSite is the site identifier attached to every Audubon request and
// used to recognise Audubon responses and missing-bird records.
const audubonSite = "audubon"

// maxAudubonConcurrentRequests is the value passed as each request's
// SiteMaxConcurrentRequests.
const maxAudubonConcurrentRequests = 3
// audubonSiteMap caches the Audubon sitemap once it has been fetched; it is nil
// until the first successful call to audubonSitemap. All reads and writes go
// through audubonSiteMapLock.
var audubonSiteMap *sitemaps.Sitemap = nil
var audubonSiteMapLock = sync.RWMutex{}

// audubonSitemap returns the cached Audubon sitemap, fetching it from
// https://www.audubon.org/sitemap.xml on first use.
//
// It panics if the sitemap cannot be fetched. The cached pointer is only read
// or written while holding audubonSiteMapLock, so concurrent callers do not
// race on it, and the write lock is released via defer so that a recovered
// panic does not leave the mutex held.
func audubonSitemap() *sitemaps.Sitemap {
	// Fast path: the sitemap is already loaded; a shared read lock suffices.
	audubonSiteMapLock.RLock()
	cached := audubonSiteMap
	audubonSiteMapLock.RUnlock()
	if cached != nil {
		return cached
	}

	// Slow path: take the exclusive lock and re-check, since another goroutine
	// may have finished loading between the two lock acquisitions.
	audubonSiteMapLock.Lock()
	defer audubonSiteMapLock.Unlock()
	if audubonSiteMap == nil {
		s, err := sitemaps.GetPagedSitemapFromURL("https://www.audubon.org/sitemap.xml")
		if err != nil {
			panic(err)
		}
		audubonSiteMap = s
	}
	return audubonSiteMap
}
// audubonFieldGuideKeyPattern captures the final path segment of an audubon.ngo
// field-guide URL. It is compiled once at package scope instead of per call.
var audubonFieldGuideKeyPattern = regexp.MustCompile("audubon.ngo/field-guide/bird/([^/]+)")

// createAudubonRequests builds the request(s) needed to fetch the Audubon field
// guide page for birdName.
//
// It returns an empty slice when the bird is already recorded as missing for
// the audubon site, when the sitemap has no sufficiently close URL (fuzzy-match
// distance greater than 2), or when the matched URL does not have the expected
// field-guide shape. In the latter two cases the bird is recorded as missing
// for the audubon site. Otherwise it returns exactly one request, with birdName
// attached as round-trip data.
func createAudubonRequests(birdName bird.BirdName) []*amass.GetRequest {
	if isMissing(audubonSite, birdName) {
		return []*amass.GetRequest{}
	}
	// Build the URL slug: spaces become hyphens and apostrophes are dropped.
	nameParam := strings.ReplaceAll(strings.ReplaceAll(birdName.English, " ", "-"), "'", "")
	// For whatever reason, audubon's sitemap references .ngo links, but these 400 when you
	// actually request them. However, the same URL pattern DOES work when you use the .org TLD
	ngoTargetURL := "https://audubon.ngo/field-guide/bird/" + nameParam
	sitemap := audubonSitemap()
	ngoURL, levDist := sitemap.BestFuzzyMatch(ngoTargetURL)
	if levDist > 2 {
		// Record the miss against the audubon site (not another site's key), so
		// the isMissing check above short-circuits on later runs.
		recordMissing(audubonSite, birdName)
		return []*amass.GetRequest{}
	}
	// A fuzzy match is not guaranteed to have the field-guide shape; treat a
	// non-matching URL as missing rather than indexing into a nil match.
	match := audubonFieldGuideKeyPattern.FindStringSubmatch(ngoURL)
	if match == nil {
		recordMissing(audubonSite, birdName)
		return []*amass.GetRequest{}
	}
	requestKey := match[1]
	actualURL := "https://audubon.org/field-guide/bird/" + requestKey
	req := &amass.GetRequest{
		Site:                      audubonSite,
		RequestKey:                requestKey,
		URL:                       actualURL,
		SiteMaxConcurrentRequests: maxAudubonConcurrentRequests,
		Attribution: attributions.Attribution{
			Author:              "National Audubon Society, Inc.",
			AuthorUrl:           "https://audubon.org",
			License:             "All rights reserved",
			LicenseUrl:          "https://www.audubon.org/terms-use",
			ScrapingMethodology: "github.com/gbdubs/bird_data_guessing/audubon",
			// The sitemap is keyed by the .ngo URL it lists, not the .org URL we fetch.
			CreatedAt: sitemap.LastUpdated[ngoURL],
		},
	}
	req.SetRoundTripData(birdName)
	return []*amass.GetRequest{req}
}
// reconstructAudubonResponsesKeyedByEnglishName picks the audubon responses out
// of responses and indexes them by the English name carried in each response's
// round-trip data. Responses for other sites are ignored. A response whose page
// is the site's "not found" page is recorded as missing and left out of the
// result.
func reconstructAudubonResponsesKeyedByEnglishName(responses []*amass.GetResponse) map[string]*audubonResponse {
	byEnglishName := map[string]*audubonResponse{}
	for _, resp := range responses {
		if resp.Site != audubonSite {
			continue
		}
		// Recover the bird name that was attached when the request was created.
		var name bird.BirdName
		resp.GetRoundTripData(&name)
		if isAudubonResponseMissing(resp) {
			recordMissing(audubonSite, name)
		} else {
			byEnglishName[name.English] = &audubonResponse{Response: *resp}
		}
	}
	return byEnglishName
}
// audubonRequestForTesting runs the full request/response path for a single
// English bird name and returns the resulting audubonResponse (nil if the
// response was dropped during reconstruction). It panics if request creation
// does not yield exactly one request or if the fetch fails.
func audubonRequestForTesting(englishName string) *audubonResponse {
	requests := createAudubonRequests(bird.BirdName{English: englishName})
	if n := len(requests); n != 1 {
		panic(fmt.Errorf("Expected 1 audubon request, was %d, for key %s.", n, englishName))
	}
	response, err := requests[0].Get()
	if err != nil {
		panic(fmt.Errorf("Get request failed for %s: %v", englishName, err))
	}
	return reconstructAudubonResponsesKeyedByEnglishName([]*amass.GetResponse{response})[englishName]
}
// isAudubonResponseMissing reports whether the fetched page's <title> text
// contains the site's "page not found" message.
func isAudubonResponseMissing(r *amass.GetResponse) bool {
	const notFoundMarker = "Sorry, We Couldn't Find That Page"
	title := r.AsDocument().Find("title").Text()
	return strings.Contains(title, notFoundMarker)
}
// propertySearchers extracts the relevant text sections from the Audubon page
// and builds one attributed searcher per bird property from them. Each section
// is the text of the element immediately following the first matching <h2>,
// except habitat, which is read from the first <td> in the row of the first
// <th> containing "Habitat".
func (r *audubonResponse) propertySearchers() *propertySearchers {
	page := r.Response.AsDocument()
	attribution := &r.Response.Attribution

	// textAfter returns the text of the sibling element that follows the first
	// match of selector.
	textAfter := func(selector string) string {
		return page.Find(selector).First().Next().Text()
	}

	dietText := textAfter("h2:contains('Diet')")
	feedingText := textAfter("h2:contains('Feeding')")
	// The eggs text always carries a fixed literal prefix ahead of the page text.
	eggsText := "EggsAudubonEggs " + textAfter("h2:contains('Eggs')")
	nestingText := textAfter("h2:contains('Nesting')")
	habitatRow := page.Find("th:contains('Habitat')").First().Parent()
	habitatText := habitatRow.Find("td").First().Text()
	allText := page.Find("body").Text()

	return &propertySearchers{
		// Wingspan is omitted, it isn't consistently helpful.
		clutchSize: attributedSearch(attribution, eggsText+nestingText),
		eggColor:   attributedSearch(attribution, eggsText),
		// Fun fact is omitted, it's not reliably fun.
		food:     attributedSearch(attribution, dietText+feedingText),
		nestType: attributedSearch(attribution, nestingText),
		habitat:  attributedSearch(attribution, habitatText+nestingText),
		predator: attributedSearch(attribution, allText),
		flocking: attributedSearch(attribution, allText),
	}
}