diff --git a/README.md b/README.md
index c1fa9b7..3df75ef 100644
--- a/README.md
+++ b/README.md
@@ -17,5 +17,7 @@ Configuration is handled through environment variables as listed below:
   - Example: `HEADERS=Rate-Limit-Token:ABC123,X-Header:X-Value`
 - CONCURRENCY: Controls the number of concurrent requests, useful for controlling request rate.
   - Example: `CONCURRENCY=10`
+- URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should crawl. All other URLs will be avoided.
+  - Example: `URL_RULES=https://www.gov.uk/.*`
 - DISALLOWED_URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should avoid.
   - Example: `DISALLOWED_URL_RULES=/search/.*,/government/.*\.atom`
diff --git a/internal/client/client.go b/internal/client/client.go
index 2b09580..a1d09f0 100644
--- a/internal/client/client.go
+++ b/internal/client/client.go
@@ -42,6 +42,21 @@ func NewClient(c *colly.Collector, redirectHandler func(*http.Request, []*http.R
 func isRequestAllowed(c *colly.Collector, parsedURL *url.URL) bool {
 	u := []byte(parsedURL.String())
 
+	// Mirror colly's own URLFilters semantics: when filters are configured,
+	// a URL is allowed if it matches ANY filter; an empty list allows all.
+	if len(c.URLFilters) > 0 {
+		matched := false
+		for _, r := range c.URLFilters {
+			if r.Match(u) {
+				matched = true
+				break
+			}
+		}
+		if !matched {
+			return false
+		}
+	}
+
 	for _, r := range c.DisallowedURLFilters {
 		if r.Match(u) {
 			return false
diff --git a/internal/client/client_test.go b/internal/client/client_test.go
index bca33cc..7e7cfba 100644
--- a/internal/client/client_test.go
+++ b/internal/client/client_test.go
@@ -44,6 +44,7 @@ func TestNewClient(t *testing.T) {
 func TestIsRequestAllowedTableDriven(t *testing.T) {
 	tests := []struct {
 		name            string
+		allowedURLs     []*regexp.Regexp
 		disallowedURLs  []*regexp.Regexp
 		allowedDomains  []string
 		url             string
@@ -60,6 +61,12 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 			url:             "http://example.com",
 			expectedAllowed: false,
 		},
+		{
+			name:            "URL filter",
+			allowedURLs:     []*regexp.Regexp{regexp.MustCompile("https://www.gov.uk")},
+			url:             "http://example.com",
+			expectedAllowed: false,
+		},
 		{
 			name:            "allowed domain",
 			allowedDomains:  []string{"example.com"},
@@ -78,6 +85,7 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			c := colly.NewCollector()
 			c.DisallowedURLFilters = tt.disallowedURLs
+			c.URLFilters = tt.allowedURLs
 			c.AllowedDomains = tt.allowedDomains
 			parsedURL, _ := url.Parse(tt.url)
 			assert.Equal(t, tt.expectedAllowed, isRequestAllowed(c, parsedURL))
diff --git a/internal/config/config.go b/internal/config/config.go
index f8c7869..47407d9 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -13,6 +13,7 @@ type Config struct {
 	UserAgent            string            `env:"USER_AGENT" envDefault:"govukbot"`
 	Headers              map[string]string `env:"HEADERS"`
 	Concurrency          int               `env:"CONCURRENCY" envDefault:"10"`
+	URLFilters           []*regexp.Regexp  `env:"URL_RULES" envSeparator:","`
 	DisallowedURLFilters []*regexp.Regexp  `env:"DISALLOWED_URL_RULES" envSeparator:","`
 }
 
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 9cb7e74..f2cb907 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -29,7 +29,8 @@ func TestNewConfig(t *testing.T) {
 				"USER_AGENT":           "custom-agent",
 				"HEADERS":              "Test-Header:Test-Value",
 				"CONCURRENCY":          "20",
-				"DISALLOWED_URL_RULES": "rule1,rule2",
+				"URL_RULES":            "rule1,rule2",
+				"DISALLOWED_URL_RULES": "rule3,rule4",
 			},
 			expected: &Config{
 				Site:      "example.com",
@@ -39,10 +40,14 @@ func TestNewConfig(t *testing.T) {
 					"Test-Header": "Test-Value",
 				},
 				Concurrency: 20,
-				DisallowedURLFilters: []*regexp.Regexp{
+				URLFilters: []*regexp.Regexp{
 					regexp.MustCompile("rule1"),
 					regexp.MustCompile("rule2"),
 				},
+				DisallowedURLFilters: []*regexp.Regexp{
+					regexp.MustCompile("rule3"),
+					regexp.MustCompile("rule4"),
+				},
 			},
 		},
 	}
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index d98afac..0dd8931 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -31,6 +31,7 @@ func newCollector(cfg *config.Config) (*colly.Collector, error) {
 	c := colly.NewCollector(
 		colly.UserAgent(cfg.UserAgent),
 		colly.AllowedDomains(cfg.AllowedDomains...),
+		colly.URLFilters(cfg.URLFilters...),
 		colly.DisallowedURLFilters(cfg.DisallowedURLFilters...),
 		colly.Async(true),
 	)
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
index cfc4841..3c39525 100644
--- a/internal/crawler/crawler_test.go
+++ b/internal/crawler/crawler_test.go
@@ -184,6 +184,9 @@ func TestNewCrawler(t *testing.T) {
 	cfg := &config.Config{
 		UserAgent:      "custom-agent",
 		AllowedDomains: []string{"example.com"},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile(".*disallowed.*"),
 		},
@@ -196,6 +199,7 @@ func TestNewCrawler(t *testing.T) {
 	assert.IsType(t, &colly.Collector{}, cr.collector)
 	assert.Equal(t, "custom-agent", cr.collector.UserAgent)
 	assert.Equal(t, []string{"example.com"}, cr.collector.AllowedDomains)
+	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*")}, cr.collector.URLFilters)
 	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*disallowed.*")}, cr.collector.DisallowedURLFilters)
 	assert.Equal(t, true, cr.collector.Async)
 }
@@ -284,6 +288,9 @@ func TestRun(t *testing.T) {
 	cfg := &config.Config{
 		Site:           ts.URL + "/sitemap.xml",
 		AllowedDomains: []string{hostname},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile("/disallowed"),
 		},