From c50a8b430a8b2d10274a986c1ad1393a6f4792ba Mon Sep 17 00:00:00 2001 From: Giorgi Kavrelishvili Date: Fri, 22 Dec 2023 12:26:00 +0400 Subject: [PATCH] Modify requirements to enable specific caches, update examples --- examples/application.cr | 30 ++++++++++--- examples/human_resources/spider.cr | 19 ++++---- examples/resourceful_humans/spider.cr | 20 ++++----- shard.yml | 3 +- src/squirm.cr | 6 ++- src/squirm/caches/redis.cr | 1 + src/squirm/caches/rocksdb.cr | 1 + src/squirm/engine.cr | 62 ++++++++++++++------------- src/squirm/request_storage.cr | 55 +++++++++++++++--------- src/squirm/spider.cr | 2 +- 10 files changed, 118 insertions(+), 81 deletions(-) diff --git a/examples/application.cr b/examples/application.cr index cae9b57..1c1d831 100644 --- a/examples/application.cr +++ b/examples/application.cr @@ -1,3 +1,4 @@ +require "../src/squirm/caches/redis" require "../src/squirm" require "./human_resources/**" require "./resourceful_humans/**" @@ -6,14 +7,29 @@ Log.setup(:debug) engine = Squirm::Engine.new -engine.add_spider(HumanResources::Spider.new) -engine.add_spider(ResourcefulHumans::Spider.new) +spiders = [ + HumanResources::Spider.new, + ResourcefulHumans::Spider.new, +] of Squirm::Spider + +spiders.each do |spider| + engine.add_spider(spider) +end + +engine.run loop do - sleep 60 + spiders.each do |spider| + unless Squirm::RequestStorage.instance.empty?(spider.id) + size = Squirm::RequestStorage + .instance + .requests + .[spider.id] + .size - engine.spiders.each do |spider| - queue_size = Squirm::RequestStorage.instance.requests[spider.id].size - Log.info { "Spider #{spider.id} is running and has queued #{queue_size} requests." 
} if queue_size != 0 + Log.debug { "#{spider.id} running with #{size} request(s)" } + end end -end + + sleep 30 +end \ No newline at end of file diff --git a/examples/human_resources/spider.cr b/examples/human_resources/spider.cr index 7d7f4c3..7c66f2f 100644 --- a/examples/human_resources/spider.cr +++ b/examples/human_resources/spider.cr @@ -15,7 +15,10 @@ module HumanResources property start_urls : Array(String) = ["https://www.hr.gov.ge/?pageNo=1"] # Caching mechanism used by the spider to cache the requests in case of a restart/failure. - property cache : Squirm::Caches::Base = Squirm::Caches::RocksDB.new(@@id) + property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id) + + # Used by the engine to fetch the URLs. + property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new # Parser used by the spider to parse the HTML content. property parser : Squirm::Parser = Parser.new @@ -26,21 +29,19 @@ module HumanResources # Used by the spider to filter the responses. property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".Title-box")] of Squirm::ResponseFilters::Base - # Time spent between each request - property timeout : Time::Span = 5.seconds + # Time spent between each request. + property request_timeout : Time::Span = 5.seconds - # Concurrent requests per domain - property concurrent_requests_per_domain : Int32 = 5 + # Concurrent requests per domain. + property concurrent_requests_per_domain : Int32 = 2 - # Used by the engine to fetch the URLs - property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new # Used by the caching mechanism to retrieve the requests from the cache. def start_requests : Array(Squirm::Request) cache.list_requests!(base_url()) end - # Parsing logic to identify the listing URLs and pagination URLs + # Parsing logic to identify the listing URLs and pagination URLs. 
def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem cache.delete!(request.url) @@ -76,7 +77,7 @@ module HumanResources .map { |href| Squirm::Utils.build_absolute_url(href, base_url) } end - # Parse HTML for pagination URLs + # Parse HTML for pagination URLs. def pagination_urls(document : Lexbor::Parser) : Array(String) document .find("li.PagedList-skipToNext a") diff --git a/examples/resourceful_humans/spider.cr b/examples/resourceful_humans/spider.cr index a26e155..2be368c 100644 --- a/examples/resourceful_humans/spider.cr +++ b/examples/resourceful_humans/spider.cr @@ -17,6 +17,9 @@ module ResourcefulHumans # Caching mechanism used by the spider to cache the requests in case of a restart/failure. property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id) + # If you want to use the Chrome fetcher add the chromedriver to your PATH. + property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new + # Parser used by the spider to parse the HTML content. property parser : Squirm::Parser = Parser.new @@ -26,23 +29,18 @@ module ResourcefulHumans # Used by the spider to filter the responses. property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".ann-title")] of Squirm::ResponseFilters::Base - # Time spent between each request - property timeout : Time::Span = 5.seconds - - # Concurrent requests per domain - property concurrent_requests_per_domain : Int32 = 5 + # Time spent between each request. + property request_timeout : Time::Span = 5.seconds - # - # If you want to use the Chrome fetcher add the chromedriver to your PATH - # - property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Chrome.new + # Concurrent requests per domain. + property concurrent_requests_per_domain : Int32 = 2 # Used by the caching mechanism to retrieve the requests from the cache. 
def start_requests : Array(Squirm::Request) cache.list_requests!(base_url()) end - # Parsing logic to identify the listing URLs and pagination URLs + # Parsing logic to identify the listing URLs and pagination URLs. def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem cache.delete!(request.url) @@ -78,7 +76,7 @@ module ResourcefulHumans .map { |href| Squirm::Utils.build_absolute_url(href, base_url) } end - # Parse HTML for pagination URLs + # Parse HTML for pagination URLs. def pagination_urls(document : Lexbor::Parser) : Array(String) document .find(".paging-container a.item") diff --git a/shard.yml b/shard.yml index 924793b..a89a048 100644 --- a/shard.yml +++ b/shard.yml @@ -1,5 +1,5 @@ name: squirm -version: 0.1.1 +version: 0.2.0 authors: - Giorgi Kavrelishvili @@ -29,7 +29,6 @@ dependencies: development_dependencies: ameba: github: crystal-ameba/ameba - version: ~> 0.13.0 crystal: ~> 1.2.0 diff --git a/src/squirm.cr b/src/squirm.cr index a284050..059c800 100644 --- a/src/squirm.cr +++ b/src/squirm.cr @@ -3,7 +3,11 @@ require "lexbor" require "robots" require "log" -require "./squirm/**" +require "./squirm/fetchers/**" +require "./squirm/ext/**" +require "./squirm/request_filters/**" +require "./squirm/response_filters/**" +require "./squirm/*" module Squirm {% unless flag?(:preview_mt) %} diff --git a/src/squirm/caches/redis.cr b/src/squirm/caches/redis.cr index 345a8c7..90c8538 100644 --- a/src/squirm/caches/redis.cr +++ b/src/squirm/caches/redis.cr @@ -1,3 +1,4 @@ +require "./base" require "redis" module Squirm diff --git a/src/squirm/caches/rocksdb.cr b/src/squirm/caches/rocksdb.cr index 5dd4958..e90c091 100644 --- a/src/squirm/caches/rocksdb.cr +++ b/src/squirm/caches/rocksdb.cr @@ -1,3 +1,4 @@ +require "./base" require "rocksdb" module Squirm diff --git a/src/squirm/engine.cr b/src/squirm/engine.cr index efe4234..857176f 100644 --- a/src/squirm/engine.cr +++ b/src/squirm/engine.cr @@ -5,44 +5,48 @@ module Squirm 
getter spiders : Synchronized(Array(Spider)) = Synchronized(Array(Spider)).new def add_spider(spider : Spider) - RequestStorage.instance.store(spider, spider.start_urls) if spider.start_requests.empty? - RequestStorage.instance.store(spider, spider.start_requests) + RequestStorage.instance.store(spider.id, spider.start_urls) if spider.start_requests.empty? + RequestStorage.instance.store(spider.id, spider.start_requests) @spiders.push(spider) + end - spawn do - pool = Pool.new(spider.concurrent_requests_per_domain) - fetcher = spider.fetcher + def run + spiders.each do |spider| + spawn do + pool = Pool.new(spider.concurrent_requests_per_domain) + fetcher = spider.fetcher - loop do - unless RequestStorage.instance.empty?(spider) - request = RequestStorage.instance.pop!(spider) - request.spider = spider + loop do + unless RequestStorage.instance.empty?(spider.id) + request = RequestStorage.instance.pop!(spider.id) + request.spider = spider - pool.spawn do - begin - response = fetcher.fetch(request) + pool.spawn do + begin + response = fetcher.fetch(request) - parsed_item = spider.parse_item(request, response) - parse(spider, parsed_item) + parsed_item = spider.parse_item(request, response) + parse(spider, parsed_item) - sleep(spider.timeout) - rescue exception : Crest::RequestFailed - status_code = exception.response.status_code.to_i + sleep(spider.request_timeout) + rescue exception : Crest::RequestFailed + status_code = exception.response.status_code.to_i - case status_code - when 429, 500..511 - Log.error(exception: exception) { exception.message } + case status_code + when 429, 500..511 + Log.error(exception: exception) { exception.message } - if request.retriable? - request.retry - RequestStorage.instance.store(spider, request) + if request.retriable? 
+ request.retry + RequestStorage.instance.store(spider.id, request) + end + else + Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." } end - else - Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." } + rescue exception : Exception + Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." } end - rescue exception : Exception - Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." } end end end @@ -52,7 +56,7 @@ module Squirm def remove_spider(spider : Spider) spider.cache.flush - RequestStorage.instance.flush(spider) + RequestStorage.instance.flush(spider.id) @spiders.delete(spider) end @@ -62,7 +66,7 @@ module Squirm end private def parse_requests(spider : Spider, parsed_item : ParsedItem) - RequestStorage.instance.store(spider, parsed_item.requests) + RequestStorage.instance.store(spider.id, parsed_item.requests) end private def parse_items(spider : Spider, parsed_item : ParsedItem) diff --git a/src/squirm/request_storage.cr b/src/squirm/request_storage.cr index 740d41c..3450604 100644 --- a/src/squirm/request_storage.cr +++ b/src/squirm/request_storage.cr @@ -11,48 +11,61 @@ module Squirm getter requests : Synchronized(Hash(String, Array(Request))) = Synchronized(Hash(String, Array(Request))).new getter history : Synchronized(Hash(String, Array(String))) = Synchronized(Hash(String, Array(String))).new - def store(spider : Spider, request : Request) - if @requests.has_key?(spider.id) - unless @history[spider.id].includes?(request.url) - @history[spider.id].push(request.url) - @requests[spider.id].push(request) + def store(id : String, request : Request) + if @requests.has_key?(id) + unless @history[id].includes?(request.url) + @history[id].push(request.url) + @requests[id].push(request) end else - @history[spider.id] = [request.url] - 
@requests[spider.id] = [request] + @history[id] = [request.url] + @requests[id] = [request] end end - def store(spider : Spider, requests : Array(Request)) + def store(id : String, requests : Array(Request)) requests.each do |request| - store(spider, request) + store(id, request) end end - def store(spider : Spider, url : String) - store(spider, Request.new(:get, url)) + def store(id : String, url : String) + store(id, Request.new(:get, url)) end - def store(spider : Spider, urls : Array(String)) + def store(id : String, urls : Array(String)) urls.each do |url| - store(spider, url) + store(id, url) end end - def pop!(spider : Spider) : Request - @requests[spider.id].pop + def flush(id : String) + @requests[id] = [] of Request end - def pop?(spider : Spider) : Request? - @requests[spider.id].pop? + def clear(id : String) + @requests[id] = [] of Request + @history[id] = [] of String end - def flush(spider : Spider) - @requests[spider.id] = [] of Request + def pop!(id : String) + @requests[id].pop? || raise Exception.new("Request storage is empty") end - def empty?(spider : Spider) : Bool - @requests[spider.id].empty? + def delete_history(id : String, url : String) + @history[id].delete(url) + end + + def seen?(id : String, url : String) : Bool + @history[id].includes?(url) + end + + def empty?(id : String) : Bool + @requests[id].empty? + end + + def exists?(id : String) : Bool + @requests[id]? != nil || @history[id]? != nil end end end diff --git a/src/squirm/spider.cr b/src/squirm/spider.cr index cf86b11..cf0542c 100644 --- a/src/squirm/spider.cr +++ b/src/squirm/spider.cr @@ -12,7 +12,7 @@ module Squirm abstract def parse_item(request : Request, response : Response) : ParsedItem abstract def request_filters : Array(RequestFilters::Base) abstract def response_filters : Array(ResponseFilters::Base) - abstract def timeout : Time::Span + abstract def request_timeout : Time::Span abstract def concurrent_requests_per_domain : Int32 end end