From 0efa519e2b142ad896db1e9402c399a40db8333b Mon Sep 17 00:00:00 2001
From: Jerry Ng <ngshiheng@gmail.com>
Date: Tue, 4 Jun 2024 09:49:41 +0800
Subject: [PATCH] feat: use queue for managing requests

---
 pkg/crawler/crawler.go | 52 +++++++++++++++++++++++++++++++++---------
 1 file changed, 41 insertions(+), 11 deletions(-)

diff --git a/pkg/crawler/crawler.go b/pkg/crawler/crawler.go
index f976a02..abdccad 100644
--- a/pkg/crawler/crawler.go
+++ b/pkg/crawler/crawler.go
@@ -1,13 +1,13 @@
 package crawler
 
 import (
-	"net/http"
 	"path/filepath"
 	"strings"
 	"time"
 
 	"github.com/gocolly/colly/v2"
 	"github.com/gocolly/colly/v2/extensions"
+	"github.com/gocolly/colly/v2/queue"
 	"github.com/ngshiheng/michelin-my-maps/v2/pkg/logger"
 	"github.com/ngshiheng/michelin-my-maps/v2/pkg/michelin"
 	"github.com/ngshiheng/michelin-my-maps/v2/pkg/parser"
@@ -17,18 +17,26 @@ import (
 )
 
 const (
+	// Colly collector settings
 	allowedDomain = "guide.michelin.com"
 	cachePath     = "cache"
-	delay         = 2 * time.Second
-	parallelism   = 5
-	randomDelay   = 2 * time.Second
-	sqlitePath    = "michelin.db"
+	delay         = 1 * time.Second
+	randomDelay   = 4 * time.Second
+	parallelism   = 2
+
+	// Colly queue settings
+	threadCount = 5
+	urlCount    = 20_000 // There are currently ~17k restaurants on Michelin Guide as of Jun 2024
+
+	// SQLite database settings
+	sqlitePath = "michelin.db"
 )
 
 // App contains the necessary components for the crawler.
 type App struct {
 	collector    *colly.Collector
 	database     *gorm.DB
+	queue        *queue.Queue
 	michelinURLs []michelin.GuideURL
 }
 
@@ -38,6 +46,7 @@ func Default() *App {
 	a.initDefaultURLs()
 	a.initDefaultCollector()
 	a.initDefaultDatabase()
+	a.initDefaultQueue()
 	return a
 }
 
@@ -53,6 +62,7 @@ func New(distinction string, db *gorm.DB) *App {
 		michelinURLs: []michelin.GuideURL{url},
 	}
 	a.initDefaultCollector()
+	a.initDefaultQueue()
 	return a
 }
 
@@ -91,9 +101,9 @@ func (a *App) initDefaultCollector() {
 	)
 
 	c.Limit(&colly.LimitRule{
-		Parallelism: parallelism,
 		Delay:       delay,
 		RandomDelay: randomDelay,
+		Parallelism: parallelism,
 	})
 
 	extensions.RandomUserAgent(c)
@@ -146,6 +156,18 @@ func (a *App) initDefaultDatabase() {
 	a.database = db
 }
 
+// Initialize the default queue.
+func (a *App) initDefaultQueue() {
+	q, err := queue.New(
+		threadCount,
+		&queue.InMemoryQueueStorage{MaxSize: urlCount},
+	)
+	if err != nil {
+		log.Fatal("failed to create queue:", err)
+	}
+	a.queue = q
+}
+
 // Crawl crawls Michelin Guide Restaurants information from a.michelinURLs.
 func (a *App) Crawl() {
 	defer logger.TimeTrack(time.Now(), "crawl")
@@ -161,6 +183,16 @@ func (a *App) Crawl() {
 		log.Debug("finished: ", r.Request.URL)
 	})
 
+	a.collector.OnRequest(func(r *colly.Request) {
+		log.Debug("visiting: ", r.URL)
+		a.queue.AddRequest(r)
+	})
+
+	dc.OnRequest(func(r *colly.Request) {
+		log.Debug("visiting: ", r.URL)
+		a.queue.AddRequest(r)
+	})
+
 	// Extract url of each restaurant from the main page and visit them
 	a.collector.OnXML(restaurantXPath, func(e *colly.XMLElement) {
 		url := e.Request.AbsoluteURL(e.ChildAttr(restaurantDetailUrlXPath, "href"))
@@ -242,12 +274,10 @@ func (a *App) Crawl() {
 		a.database.Create(&restaurant)
 	})
 
-	// Start scraping
 	for _, url := range a.michelinURLs {
-		ctx := colly.NewContext()
-		a.collector.Request(http.MethodGet, url.URL, nil, ctx, nil)
+		a.queue.AddURL(url.URL)
 	}
 
-	a.collector.Wait()
-	dc.Wait()
+	// Start scraping
+	a.queue.Run(a.collector)
 }