From c34483955b0e7ead784865cfa6cd7e31c42cfe09 Mon Sep 17 00:00:00 2001
From: Richard Towers
Date: Mon, 2 Dec 2024 15:28:41 +0000
Subject: [PATCH] Add positive URL_RULES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are the inverse of DISALLOWED_URL_RULES - the regex must match,
otherwise the request is not allowed.

This is useful if we want to crawl a small part of a particular site,
where all the URLs match a particular pattern.

For example, crawling www.gov.uk/government/history with:

SITE=https://www.gov.uk/government/history URL_RULES='https://www.gov.uk/government/history' go run ./cmd

Gives:

www.gov.uk/government/history
├── 1-horse-guards-road.html
├── 10-downing-street.html
├── 11-downing-street.html
├── king-charles-street.html
├── lancaster-house.html
├── past-chancellors.html
├── past-foreign-secretaries
│   ├── austen-chamberlain.html
│   ├── charles-fox.html
│   ├── edward-grey.html
│   ├── edward-wood.html
│   ├── george-curzon.html
│   ├── george-gordon.html
│   ├── george-gower.html
│   ├── henry-petty-fitzmaurice.html
│   ├── robert-cecil.html
│   └── william-grenville.html
├── past-foreign-secretaries.html
├── past-prime-ministers
│   ├── alec-douglas-home.html
│   ├── andrew-bonar-law.html
│   ├── anthony-eden.html
│   ├── archibald-primrose-5th-earl-of-rosebery.html
│   ├── arthur-james-balfour.html
│   ├── arthur-wellesley-1st-duke-of-wellington.html
│   ├── augustus-henry-fitzroy-3rd-duke-of-grafton.html
│   ├── benjamin-disraeli-the-earl-of-beaconsfield.html
│   ├── charles-grey-2nd-earl-grey.html
│   ├── charles-watson-wentworth-2nd-marquess-of-rockingham.html
│   ├── clement-attlee.html
│   ├── david-cameron.html
│   ├── david-lloyd-george.html
│   ├── edward-heath.html
│   ├── edward-smith-stanley-14th-earl-of-derby.html
│   ├── frederick-north.html
│   ├── frederick-robinson-viscount-goderich.html
│   ├── george-canning.html
│   ├── george-grenville.html
│   ├── george-hamilton-gordon-earl-of-aberdeen.html
│   ├── gordon-brown.html
│   ├── harold-macmillan.html
│   ├── harold-wilson.html
│   ├── henry-addington-1st-viscount-sidmouth.html
│   ├── henry-campbell-bannerman.html
│   ├── henry-john-temple-3rd-viscount-palmerston.html
│   ├── henry-pelham.html
│   ├── herbert-henry-asquith.html
│   ├── james-callaghan.html
│   ├── james-ramsay-macdonald.html
│   ├── john-major.html
│   ├── john-stuart-3rd-earl-of-bute.html
│   ├── lord-john-russell-1st-earl-russell.html
│   ├── margaret-thatcher.html
│   ├── neville-chamberlain.html
│   ├── robert-banks-jenkinson-earl-of-liverpool.html
│   ├── robert-gascoyne-cecil.html
│   ├── robert-peel-2nd-baronet.html
│   ├── robert-walpole.html
│   ├── spencer-compton-1st-earl-of-wilmington.html
│   ├── spencer-perceval.html
│   ├── stanley-baldwin.html
│   ├── theresa-may.html
│   ├── thomas-pelham-holles-1st-duke-of-newcastle.html
│   ├── tony-blair.html
│   ├── william-bentinck-duke-of-portland.html
│   ├── william-cavendish-duke-of-devonshire.html
│   ├── william-ewart-gladstone.html
│   ├── william-lamb-2nd-viscount-melbourne.html
│   ├── william-petty-2nd-earl-of-shelburne.html
│   ├── william-pitt-1st-earl-of-chatham.html
│   ├── william-pitt.html
│   ├── william-wyndham-grenville-1st-baron-grenville.html
│   └── winston-churchill.html
└── past-prime-ministers.html

3 directories, 72 files
---
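Reviewer note (not part of the commit message or of the diff below): a minimal, standalone sketch of the filter semantics this patch adds to isRequestAllowed in internal/client/client.go, using only Go's standard regexp package. The allowed helper and the example patterns are hypothetical, written purely to illustrate the behaviour of that check: every URL_RULES pattern must match and no DISALLOWED_URL_RULES pattern may match, otherwise the request is refused.

package main

import (
	"fmt"
	"regexp"
)

// allowed mirrors the positive/negative filter checks in isRequestAllowed:
// every urlFilters pattern must match, and no disallowedURLFilters pattern
// may match, for a URL to be fetched.
func allowed(rawURL string, urlFilters, disallowedURLFilters []*regexp.Regexp) bool {
	u := []byte(rawURL)
	for _, r := range urlFilters {
		if !r.Match(u) {
			return false
		}
	}
	for _, r := range disallowedURLFilters {
		if r.Match(u) {
			return false
		}
	}
	return true
}

func main() {
	urlFilters := []*regexp.Regexp{regexp.MustCompile(`https://www\.gov\.uk/government/history`)}
	disallowed := []*regexp.Regexp{regexp.MustCompile(`\.atom$`)}

	fmt.Println(allowed("https://www.gov.uk/government/history/10-downing-street", urlFilters, disallowed)) // true
	fmt.Println(allowed("https://www.gov.uk/search/all", urlFilters, disallowed))                           // false: no URL_RULES pattern matches
	fmt.Println(allowed("https://www.gov.uk/government/history.atom", urlFilters, disallowed))              // false: a DISALLOWED_URL_RULES pattern matches
}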
 README.md                        | 2 ++
 internal/client/client.go        | 6 ++++++
 internal/client/client_test.go   | 8 ++++++++
 internal/config/config.go        | 1 +
 internal/config/config_test.go   | 9 +++++++--
 internal/crawler/crawler.go      | 1 +
 internal/crawler/crawler_test.go | 7 +++++++
 7 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c1fa9b7..3df75ef 100644
--- a/README.md
+++ b/README.md
@@ -17,5 +17,7 @@ Configuration is handled through environment variables as listed below:
   - Example: `HEADERS=Rate-Limit-Token:ABC123,X-Header:X-Value`
 - CONCURRENCY: Controls the number of concurrent requests, useful for controlling request rate.
   - Example: `CONCURRENCY=10`
+- URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should crawl. All other URLs will be avoided.
+  - Example: `URL_RULES=https://www.gov.uk/.*`
 - DISALLOWED_URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should avoid.
   - Example: `DISALLOWED_URL_RULES=/search/.*,/government/.*\.atom`
diff --git a/internal/client/client.go b/internal/client/client.go
index 2b09580..a1d09f0 100644
--- a/internal/client/client.go
+++ b/internal/client/client.go
@@ -42,6 +42,12 @@ func NewClient(c *colly.Collector, redirectHandler func(*http.Request, []*http.R
 func isRequestAllowed(c *colly.Collector, parsedURL *url.URL) bool {
 	u := []byte(parsedURL.String())
 
+	for _, r := range c.URLFilters {
+		if !r.Match(u) {
+			return false
+		}
+	}
+
 	for _, r := range c.DisallowedURLFilters {
 		if r.Match(u) {
 			return false
diff --git a/internal/client/client_test.go b/internal/client/client_test.go
index bca33cc..7e7cfba 100644
--- a/internal/client/client_test.go
+++ b/internal/client/client_test.go
@@ -44,6 +44,7 @@ func TestNewClient(t *testing.T) {
 func TestIsRequestAllowedTableDriven(t *testing.T) {
 	tests := []struct {
 		name            string
+		allowedURLs     []*regexp.Regexp
 		disallowedURLs  []*regexp.Regexp
 		allowedDomains  []string
 		url             string
@@ -60,6 +61,12 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 			url:             "http://example.com",
 			expectedAllowed: false,
 		},
+		{
+			name:            "URL filter",
+			allowedURLs:     []*regexp.Regexp{regexp.MustCompile("https://www.gov.uk")},
+			url:             "http://example.com",
+			expectedAllowed: false,
+		},
 		{
 			name:            "allowed domain",
 			allowedDomains:  []string{"example.com"},
@@ -78,6 +85,7 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			c := colly.NewCollector()
 			c.DisallowedURLFilters = tt.disallowedURLs
+			c.URLFilters = tt.allowedURLs
 			c.AllowedDomains = tt.allowedDomains
 			parsedURL, _ := url.Parse(tt.url)
 			assert.Equal(t, tt.expectedAllowed, isRequestAllowed(c, parsedURL))
diff --git a/internal/config/config.go b/internal/config/config.go
index f8c7869..47407d9 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -13,6 +13,7 @@ type Config struct {
 	UserAgent            string            `env:"USER_AGENT" envDefault:"govukbot"`
 	Headers              map[string]string `env:"HEADERS"`
 	Concurrency          int               `env:"CONCURRENCY" envDefault:"10"`
+	URLFilters           []*regexp.Regexp  `env:"URL_RULES" envSeparator:","`
 	DisallowedURLFilters []*regexp.Regexp  `env:"DISALLOWED_URL_RULES" envSeparator:","`
 }
 
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 9cb7e74..f2cb907 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -29,7 +29,8 @@ func TestNewConfig(t *testing.T) {
 				"USER_AGENT":           "custom-agent",
 				"HEADERS":              "Test-Header:Test-Value",
 				"CONCURRENCY":          "20",
-				"DISALLOWED_URL_RULES": "rule1,rule2",
+				"URL_RULES":            "rule1,rule2",
+				"DISALLOWED_URL_RULES": "rule3,rule4",
 			},
 			expected: &Config{
 				Site: "example.com",
@@ -39,10 +40,14 @@ func TestNewConfig(t *testing.T) {
 					"Test-Header": "Test-Value",
 				},
 				Concurrency: 20,
-				DisallowedURLFilters: []*regexp.Regexp{
+				URLFilters: []*regexp.Regexp{
 					regexp.MustCompile("rule1"),
 					regexp.MustCompile("rule2"),
 				},
+				DisallowedURLFilters: []*regexp.Regexp{
+					regexp.MustCompile("rule3"),
+					regexp.MustCompile("rule4"),
+				},
 			},
 		},
 	}
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index d98afac..0dd8931 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -31,6 +31,7 @@ func newCollector(cfg *config.Config) (*colly.Collector, error) {
 	c := colly.NewCollector(
 		colly.UserAgent(cfg.UserAgent),
 		colly.AllowedDomains(cfg.AllowedDomains...),
+		colly.URLFilters(cfg.URLFilters...),
 		colly.DisallowedURLFilters(cfg.DisallowedURLFilters...),
 		colly.Async(true),
 	)
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
index cfc4841..3c39525 100644
--- a/internal/crawler/crawler_test.go
+++ b/internal/crawler/crawler_test.go
@@ -184,6 +184,9 @@ func TestNewCrawler(t *testing.T) {
 	cfg := &config.Config{
 		UserAgent:      "custom-agent",
 		AllowedDomains: []string{"example.com"},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile(".*disallowed.*"),
 		},
@@ -196,6 +199,7 @@ func TestNewCrawler(t *testing.T) {
 	assert.IsType(t, &colly.Collector{}, cr.collector)
 	assert.Equal(t, "custom-agent", cr.collector.UserAgent)
 	assert.Equal(t, []string{"example.com"}, cr.collector.AllowedDomains)
+	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*")}, cr.collector.URLFilters)
 	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*disallowed.*")}, cr.collector.DisallowedURLFilters)
 	assert.Equal(t, true, cr.collector.Async)
 }
@@ -284,6 +288,9 @@ func TestRun(t *testing.T) {
 	cfg := &config.Config{
 		Site:           ts.URL + "/sitemap.xml",
 		AllowedDomains: []string{hostname},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile("/disallowed"),
 		},
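
Reviewer note (outside the diff): following the README additions above, URL_RULES and DISALLOWED_URL_RULES can be combined. For example, a hypothetical invocation that keeps the crawl inside the history section while still skipping Atom feeds might look like:

SITE=https://www.gov.uk/government/history URL_RULES='https://www.gov.uk/government/history' DISALLOWED_URL_RULES='/government/.*\.atom' go run ./cmd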