From c34483955b0e7ead784865cfa6cd7e31c42cfe09 Mon Sep 17 00:00:00 2001
From: Richard Towers
Date: Mon, 2 Dec 2024 15:28:41 +0000
Subject: [PATCH] Add positive URL_RULES
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

These are the inverse of DISALLOWED_URL_RULES - the regex must match,
otherwise the request is not allowed.

This is useful if we want to crawl a small part of a particular site,
where all the URLs match a particular pattern.

For example, crawling www.gov.uk/government/history with:

SITE=https://www.gov.uk/government/history URL_RULES='https://www.gov.uk/government/history' go run ./cmd

Gives:

www.gov.uk/government/history
├── 1-horse-guards-road.html
├── 10-downing-street.html
├── 11-downing-street.html
├── king-charles-street.html
├── lancaster-house.html
├── past-chancellors.html
├── past-foreign-secretaries
│   ├── austen-chamberlain.html
│   ├── charles-fox.html
│   ├── edward-grey.html
│   ├── edward-wood.html
│   ├── george-curzon.html
│   ├── george-gordon.html
│   ├── george-gower.html
│   ├── henry-petty-fitzmaurice.html
│   ├── robert-cecil.html
│   └── william-grenville.html
├── past-foreign-secretaries.html
├── past-prime-ministers
│   ├── alec-douglas-home.html
│   ├── andrew-bonar-law.html
│   ├── anthony-eden.html
│   ├── archibald-primrose-5th-earl-of-rosebery.html
│   ├── arthur-james-balfour.html
│   ├── arthur-wellesley-1st-duke-of-wellington.html
│   ├── augustus-henry-fitzroy-3rd-duke-of-grafton.html
│   ├── benjamin-disraeli-the-earl-of-beaconsfield.html
│   ├── charles-grey-2nd-earl-grey.html
│   ├── charles-watson-wentworth-2nd-marquess-of-rockingham.html
│   ├── clement-attlee.html
│   ├── david-cameron.html
│   ├── david-lloyd-george.html
│   ├── edward-heath.html
│   ├── edward-smith-stanley-14th-earl-of-derby.html
│   ├── frederick-north.html
│   ├── frederick-robinson-viscount-goderich.html
│   ├── george-canning.html
│   ├── george-grenville.html
│   ├── george-hamilton-gordon-earl-of-aberdeen.html
│   ├── gordon-brown.html
│   ├── harold-macmillan.html
│   ├── harold-wilson.html
│   ├── henry-addington-1st-viscount-sidmouth.html
│   ├── henry-campbell-bannerman.html
│   ├── henry-john-temple-3rd-viscount-palmerston.html
│   ├── henry-pelham.html
│   ├── herbert-henry-asquith.html
│   ├── james-callaghan.html
│   ├── james-ramsay-macdonald.html
│   ├── john-major.html
│   ├── john-stuart-3rd-earl-of-bute.html
│   ├── lord-john-russell-1st-earl-russell.html
│   ├── margaret-thatcher.html
│   ├── neville-chamberlain.html
│   ├── robert-banks-jenkinson-earl-of-liverpool.html
│   ├── robert-gascoyne-cecil.html
│   ├── robert-peel-2nd-baronet.html
│   ├── robert-walpole.html
│   ├── spencer-compton-1st-earl-of-wilmington.html
│   ├── spencer-perceval.html
│   ├── stanley-baldwin.html
│   ├── theresa-may.html
│   ├── thomas-pelham-holles-1st-duke-of-newcastle.html
│   ├── tony-blair.html
│   ├── william-bentinck-duke-of-portland.html
│   ├── william-cavendish-duke-of-devonshire.html
│   ├── william-ewart-gladstone.html
│   ├── william-lamb-2nd-viscount-melbourne.html
│   ├── william-petty-2nd-earl-of-shelburne.html
│   ├── william-pitt-1st-earl-of-chatham.html
│   ├── william-pitt.html
│   ├── william-wyndham-grenville-1st-baron-grenville.html
│   └── winston-churchill.html
└── past-prime-ministers.html

3 directories, 72 files
---
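Reviewer note (not part of the commit message or of the diff below): a minimal, standalone sketch of the filter semantics this patch adds to isRequestAllowed in internal/client/client.go, using only Go's standard regexp package. The allowed helper and the example patterns are hypothetical, written purely to illustrate the behaviour of that check: every URL_RULES pattern must match and no DISALLOWED_URL_RULES pattern may match, otherwise the request is refused.

package main

import (
	"fmt"
	"regexp"
)

// allowed mirrors the positive/negative filter checks in isRequestAllowed:
// every urlFilters pattern must match, and no disallowedURLFilters pattern
// may match, for a URL to be fetched.
func allowed(rawURL string, urlFilters, disallowedURLFilters []*regexp.Regexp) bool {
	u := []byte(rawURL)
	for _, r := range urlFilters {
		if !r.Match(u) {
			return false
		}
	}
	for _, r := range disallowedURLFilters {
		if r.Match(u) {
			return false
		}
	}
	return true
}

func main() {
	urlFilters := []*regexp.Regexp{regexp.MustCompile(`https://www\.gov\.uk/government/history`)}
	disallowed := []*regexp.Regexp{regexp.MustCompile(`\.atom$`)}

	fmt.Println(allowed("https://www.gov.uk/government/history/10-downing-street", urlFilters, disallowed)) // true
	fmt.Println(allowed("https://www.gov.uk/search/all", urlFilters, disallowed))                           // false: no URL_RULES pattern matches
	fmt.Println(allowed("https://www.gov.uk/government/history.atom", urlFilters, disallowed))              // false: a DISALLOWED_URL_RULES pattern matches
}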
 README.md                        | 2 ++
 internal/client/client.go        | 6 ++++++
 internal/client/client_test.go   | 8 ++++++++
 internal/config/config.go        | 1 +
 internal/config/config_test.go   | 9 +++++++--
 internal/crawler/crawler.go      | 1 +
 internal/crawler/crawler_test.go | 7 +++++++
 7 files changed, 32 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index c1fa9b7..3df75ef 100644
--- a/README.md
+++ b/README.md
@@ -17,5 +17,7 @@ Configuration is handled through environment variables as listed below:
   - Example: `HEADERS=Rate-Limit-Token:ABC123,X-Header:X-Value`
 - CONCURRENCY: Controls the number of concurrent requests, useful for controlling request rate.
   - Example: `CONCURRENCY=10`
+- URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should crawl. All other URLs will be avoided.
+  - Example: `URL_RULES=https://www.gov.uk/.*`
 - DISALLOWED_URL_RULES: A comma-separated list of regex patterns matching URLs that the crawler should avoid.
   - Example: `DISALLOWED_URL_RULES=/search/.*,/government/.*\.atom`
diff --git a/internal/client/client.go b/internal/client/client.go
index 2b09580..a1d09f0 100644
--- a/internal/client/client.go
+++ b/internal/client/client.go
@@ -42,6 +42,12 @@ func NewClient(c *colly.Collector, redirectHandler func(*http.Request, []*http.R
 func isRequestAllowed(c *colly.Collector, parsedURL *url.URL) bool {
 	u := []byte(parsedURL.String())
 
+	for _, r := range c.URLFilters {
+		if !r.Match(u) {
+			return false
+		}
+	}
+
 	for _, r := range c.DisallowedURLFilters {
 		if r.Match(u) {
 			return false
diff --git a/internal/client/client_test.go b/internal/client/client_test.go
index bca33cc..7e7cfba 100644
--- a/internal/client/client_test.go
+++ b/internal/client/client_test.go
@@ -44,6 +44,7 @@ func TestNewClient(t *testing.T) {
 func TestIsRequestAllowedTableDriven(t *testing.T) {
 	tests := []struct {
 		name            string
+		allowedURLs     []*regexp.Regexp
 		disallowedURLs  []*regexp.Regexp
 		allowedDomains  []string
 		url             string
@@ -60,6 +61,12 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 			url:             "http://example.com",
 			expectedAllowed: false,
 		},
+		{
+			name:            "URL filter",
+			allowedURLs:     []*regexp.Regexp{regexp.MustCompile("https://www.gov.uk")},
+			url:             "http://example.com",
+			expectedAllowed: false,
+		},
 		{
 			name:            "allowed domain",
 			allowedDomains:  []string{"example.com"},
@@ -78,6 +85,7 @@ func TestIsRequestAllowedTableDriven(t *testing.T) {
 		t.Run(tt.name, func(t *testing.T) {
 			c := colly.NewCollector()
 			c.DisallowedURLFilters = tt.disallowedURLs
+			c.URLFilters = tt.allowedURLs
 			c.AllowedDomains = tt.allowedDomains
 			parsedURL, _ := url.Parse(tt.url)
 			assert.Equal(t, tt.expectedAllowed, isRequestAllowed(c, parsedURL))
diff --git a/internal/config/config.go b/internal/config/config.go
index f8c7869..47407d9 100644
--- a/internal/config/config.go
+++ b/internal/config/config.go
@@ -13,6 +13,7 @@ type Config struct {
 	UserAgent            string            `env:"USER_AGENT" envDefault:"govukbot"`
 	Headers              map[string]string `env:"HEADERS"`
 	Concurrency          int               `env:"CONCURRENCY" envDefault:"10"`
+	URLFilters           []*regexp.Regexp  `env:"URL_RULES" envSeparator:","`
 	DisallowedURLFilters []*regexp.Regexp  `env:"DISALLOWED_URL_RULES" envSeparator:","`
 }
 
diff --git a/internal/config/config_test.go b/internal/config/config_test.go
index 9cb7e74..f2cb907 100644
--- a/internal/config/config_test.go
+++ b/internal/config/config_test.go
@@ -29,7 +29,8 @@ func TestNewConfig(t *testing.T) {
 				"USER_AGENT":           "custom-agent",
 				"HEADERS":              "Test-Header:Test-Value",
 				"CONCURRENCY":          "20",
-				"DISALLOWED_URL_RULES": "rule1,rule2",
+				"URL_RULES":            "rule1,rule2",
+				"DISALLOWED_URL_RULES": "rule3,rule4",
 			},
 			expected: &Config{
 				Site: "example.com",
@@ -39,10 +40,14 @@ func TestNewConfig(t *testing.T) {
 					"Test-Header": "Test-Value",
 				},
 				Concurrency: 20,
-				DisallowedURLFilters: []*regexp.Regexp{
+				URLFilters: []*regexp.Regexp{
 					regexp.MustCompile("rule1"),
 					regexp.MustCompile("rule2"),
 				},
+				DisallowedURLFilters: []*regexp.Regexp{
+					regexp.MustCompile("rule3"),
+					regexp.MustCompile("rule4"),
+				},
 			},
 		},
 	}
diff --git a/internal/crawler/crawler.go b/internal/crawler/crawler.go
index d98afac..0dd8931 100644
--- a/internal/crawler/crawler.go
+++ b/internal/crawler/crawler.go
@@ -31,6 +31,7 @@ func newCollector(cfg *config.Config) (*colly.Collector, error) {
 	c := colly.NewCollector(
 		colly.UserAgent(cfg.UserAgent),
 		colly.AllowedDomains(cfg.AllowedDomains...),
+		colly.URLFilters(cfg.URLFilters...),
 		colly.DisallowedURLFilters(cfg.DisallowedURLFilters...),
 		colly.Async(true),
 	)
diff --git a/internal/crawler/crawler_test.go b/internal/crawler/crawler_test.go
index cfc4841..3c39525 100644
--- a/internal/crawler/crawler_test.go
+++ b/internal/crawler/crawler_test.go
@@ -184,6 +184,9 @@ func TestNewCrawler(t *testing.T) {
 	cfg := &config.Config{
 		UserAgent:      "custom-agent",
 		AllowedDomains: []string{"example.com"},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile(".*disallowed.*"),
 		},
@@ -196,6 +199,7 @@ func TestNewCrawler(t *testing.T) {
 	assert.IsType(t, &colly.Collector{}, cr.collector)
 	assert.Equal(t, "custom-agent", cr.collector.UserAgent)
 	assert.Equal(t, []string{"example.com"}, cr.collector.AllowedDomains)
+	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*")}, cr.collector.URLFilters)
 	assert.Equal(t, []*regexp.Regexp{regexp.MustCompile(".*disallowed.*")}, cr.collector.DisallowedURLFilters)
 	assert.Equal(t, true, cr.collector.Async)
 }
@@ -284,6 +288,9 @@ func TestRun(t *testing.T) {
 	cfg := &config.Config{
 		Site:           ts.URL + "/sitemap.xml",
 		AllowedDomains: []string{hostname},
+		URLFilters: []*regexp.Regexp{
+			regexp.MustCompile(".*"),
+		},
 		DisallowedURLFilters: []*regexp.Regexp{
 			regexp.MustCompile("/disallowed"),
 		},
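
Reviewer note (outside the diff): following the README additions above, URL_RULES and DISALLOWED_URL_RULES can be combined. For example, a hypothetical invocation that keeps the crawl inside the history section while still skipping Atom feeds might look like:

SITE=https://www.gov.uk/government/history URL_RULES='https://www.gov.uk/government/history' DISALLOWED_URL_RULES='/government/.*\.atom' go run ./cmd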