Modify requirements to enable specific caches, update examples
grkek committed Dec 22, 2023
1 parent a587147 commit c50a8b4
Showing 10 changed files with 118 additions and 81 deletions.
30 changes: 23 additions & 7 deletions examples/application.cr
@@ -1,3 +1,4 @@
+require "../src/squirm/caches/redis"
 require "../src/squirm"
 require "./human_resources/**"
 require "./resourceful_humans/**"
@@ -6,14 +7,29 @@ Log.setup(:debug)
 
 engine = Squirm::Engine.new
 
-engine.add_spider(HumanResources::Spider.new)
-engine.add_spider(ResourcefulHumans::Spider.new)
+spiders = [
+  HumanResources::Spider.new,
+  ResourcefulHumans::Spider.new,
+] of Squirm::Spider
+
+spiders.each do |spider|
+  engine.add_spider(spider)
+end
 
 engine.run
 
 loop do
-  sleep 60
+  spiders.each do |spider|
+    unless Squirm::RequestStorage.instance.empty?(spider.id)
+      size = Squirm::RequestStorage
+        .instance
+        .requests
+        .[spider.id]
+        .size
 
-  engine.spiders.each do |spider|
-    queue_size = Squirm::RequestStorage.instance.requests[spider.id].size
-    Log.info { "Spider #{spider.id} is running and has queued #{queue_size} requests." } if queue_size != 0
+      Log.debug { "#{spider.id} running with #{size} request(s)" }
+    end
   end
-end
+
+  sleep 30
+end
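
(Illustration, not part of this commit: remove_spider in src/squirm/engine.cr flushes a spider's cache and queued requests, so the example above could be paired with a shutdown hook along these lines — a sketch, assuming the engine and spiders variables from this example.)

    Signal::INT.trap do
      # Flush each spider's cache and queued requests before exiting.
      spiders.each { |spider| engine.remove_spider(spider) }
      exit
    end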
19 changes: 10 additions & 9 deletions examples/human_resources/spider.cr
@@ -15,7 +15,10 @@ module HumanResources
     property start_urls : Array(String) = ["https://www.hr.gov.ge/?pageNo=1"]
 
     # Caching mechanism used by the spider to cache the requests in case of a restart/failure.
-    property cache : Squirm::Caches::Base = Squirm::Caches::RocksDB.new(@@id)
+    property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id)
+
+    # Used by the engine to fetch the URLs.
+    property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
 
     # Parser used by the spider to parse the HTML content.
     property parser : Squirm::Parser = Parser.new
@@ -26,21 +29,19 @@ module HumanResources
     # Used by the spider to filter the responses.
     property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".Title-box")] of Squirm::ResponseFilters::Base
 
-    # Time spent between each request
-    property timeout : Time::Span = 5.seconds
+    # Time spent between each request.
+    property request_timeout : Time::Span = 5.seconds
 
-    # Concurrent requests per domain
-    property concurrent_requests_per_domain : Int32 = 5
+    # Concurrent requests per domain.
+    property concurrent_requests_per_domain : Int32 = 2
 
-    # Used by the engine to fetch the URLs
-    property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
-
     # Used by the caching mechanism to retrieve the requests from the cache.
     def start_requests : Array(Squirm::Request)
      cache.list_requests!(base_url())
    end
 
-    # Parsing logic to identify the listing URLs and pagination URLs
+    # Parsing logic to identify the listing URLs and pagination URLs.
    def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem
      cache.delete!(request.url)
@@ -76,7 +77,7 @@ module HumanResources
        .map { |href| Squirm::Utils.build_absolute_url(href, base_url) }
    end
 
-    # Parse HTML for pagination URLs
+    # Parse HTML for pagination URLs.
    def pagination_urls(document : Lexbor::Parser) : Array(String)
      document
        .find("li.PagedList-skipToNext a")
20 changes: 9 additions & 11 deletions examples/resourceful_humans/spider.cr
@@ -17,6 +17,9 @@ module ResourcefulHumans
     # Caching mechanism used by the spider to cache the requests in case of a restart/failure.
     property cache : Squirm::Caches::Base = Squirm::Caches::Redis.new(@@id)
 
+    # If you want to use the Chrome fetcher add the chromedriver to your PATH.
+    property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Default.new
+
     # Parser used by the spider to parse the HTML content.
     property parser : Squirm::Parser = Parser.new
 
@@ -26,23 +29,18 @@ module ResourcefulHumans
     # Used by the spider to filter the responses.
     property response_filters : Array(Squirm::ResponseFilters::Base) = [Squirm::ResponseFilters::ContentValidator.new(selector: ".ann-title")] of Squirm::ResponseFilters::Base
 
-    # Time spent between each request
-    property timeout : Time::Span = 5.seconds
-
-    # Concurrent requests per domain
-    property concurrent_requests_per_domain : Int32 = 5
+    # Time spent between each request.
+    property request_timeout : Time::Span = 5.seconds
 
-    #
-    # If you want to use the Chrome fetcher add the chromedriver to your PATH
-    #
-    property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Chrome.new
+    # Concurrent requests per domain.
+    property concurrent_requests_per_domain : Int32 = 2
 
     # Used by the caching mechanism to retrieve the requests from the cache.
    def start_requests : Array(Squirm::Request)
      cache.list_requests!(base_url())
    end
 
-    # Parsing logic to identify the listing URLs and pagination URLs
+    # Parsing logic to identify the listing URLs and pagination URLs.
    def parse_item(request : Squirm::Request, response : Squirm::Response) : Squirm::ParsedItem
      cache.delete!(request.url)
@@ -78,7 +76,7 @@ module ResourcefulHumans
        .map { |href| Squirm::Utils.build_absolute_url(href, base_url) }
    end
 
-    # Parse HTML for pagination URLs
+    # Parse HTML for pagination URLs.
    def pagination_urls(document : Lexbor::Parser) : Array(String)
      document
        .find(".paging-container a.item")
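
(Illustration, not part of this commit: the comment above still mentions the Chrome fetcher, which this example previously used. Switching back is a one-line change, assuming chromedriver is on the PATH.)

    property fetcher : Squirm::Fetchers::Base = Squirm::Fetchers::Chrome.new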
3 changes: 1 addition & 2 deletions shard.yml
@@ -1,5 +1,5 @@
 name: squirm
-version: 0.1.1
+version: 0.2.0
 
 authors:
   - Giorgi Kavrelishvili <giorgi.kavrelishvili@pm.me>
@@ -29,7 +29,6 @@ dependencies:
 development_dependencies:
   ameba:
     github: crystal-ameba/ameba
-    version: ~> 0.13.0
 
 crystal: ~> 1.2.0
 
6 changes: 5 additions & 1 deletion src/squirm.cr
@@ -3,7 +3,11 @@ require "lexbor"
 require "robots"
 require "log"
 
-require "./squirm/**"
+require "./squirm/fetchers/**"
+require "./squirm/ext/**"
+require "./squirm/request_filters/**"
+require "./squirm/response_filters/**"
+require "./squirm/*"
 
 module Squirm
   {% unless flag?(:preview_mt) %}
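
(Illustration, not part of this commit: with the recursive require "./squirm/**" replaced by the narrower list above, cache backends are no longer loaded automatically; a consumer opts in explicitly before the main require, as examples/application.cr does.)

    require "../src/squirm/caches/redis" # or "../src/squirm/caches/rocksdb"
    require "../src/squirm"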
1 change: 1 addition & 0 deletions src/squirm/caches/redis.cr
@@ -1,3 +1,4 @@
+require "./base"
 require "redis"
 
 module Squirm
1 change: 1 addition & 0 deletions src/squirm/caches/rocksdb.cr
@@ -1,3 +1,4 @@
+require "./base"
 require "rocksdb"
 
 module Squirm
62 changes: 33 additions & 29 deletions src/squirm/engine.cr
@@ -5,44 +5,48 @@ module Squirm
     getter spiders : Synchronized(Array(Spider)) = Synchronized(Array(Spider)).new
 
     def add_spider(spider : Spider)
-      RequestStorage.instance.store(spider, spider.start_urls) if spider.start_requests.empty?
-      RequestStorage.instance.store(spider, spider.start_requests)
+      RequestStorage.instance.store(spider.id, spider.start_urls) if spider.start_requests.empty?
+      RequestStorage.instance.store(spider.id, spider.start_requests)
 
       @spiders.push(spider)
+    end
 
-      spawn do
-        pool = Pool.new(spider.concurrent_requests_per_domain)
-        fetcher = spider.fetcher
+    def run
+      spiders.each do |spider|
+        spawn do
+          pool = Pool.new(spider.concurrent_requests_per_domain)
+          fetcher = spider.fetcher
 
-        loop do
-          unless RequestStorage.instance.empty?(spider)
-            request = RequestStorage.instance.pop!(spider)
-            request.spider = spider
+          loop do
+            unless RequestStorage.instance.empty?(spider.id)
+              request = RequestStorage.instance.pop!(spider.id)
+              request.spider = spider
 
-            pool.spawn do
-              begin
-                response = fetcher.fetch(request)
+              pool.spawn do
+                begin
+                  response = fetcher.fetch(request)
 
-                parsed_item = spider.parse_item(request, response)
-                parse(spider, parsed_item)
+                  parsed_item = spider.parse_item(request, response)
+                  parse(spider, parsed_item)
 
-                sleep(spider.timeout)
-              rescue exception : Crest::RequestFailed
-                status_code = exception.response.status_code.to_i
+                  sleep(spider.request_timeout)
+                rescue exception : Crest::RequestFailed
+                  status_code = exception.response.status_code.to_i
 
-                case status_code
-                when 429, 500..511
-                  Log.error(exception: exception) { exception.message }
+                  case status_code
+                  when 429, 500..511
+                    Log.error(exception: exception) { exception.message }
 
-                  if request.retriable?
-                    request.retry
-                    RequestStorage.instance.store(spider, request)
+                    if request.retriable?
+                      request.retry
+                      RequestStorage.instance.store(spider.id, request)
+                    end
+                  else
+                    Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." }
                   end
-                else
-                  Log.error(exception: exception) { "Dropping the request, failed to get a response status code which could be used to recover a request." }
+                rescue exception : Exception
+                  Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." }
                 end
-                rescue exception : Exception
-                  Log.error(exception: exception) { "Dropping the request, a non HTTP error occured." }
              end
            end
          end
@@ -52,7 +56,7 @@ module Squirm
 
     def remove_spider(spider : Spider)
       spider.cache.flush
-      RequestStorage.instance.flush(spider)
+      RequestStorage.instance.flush(spider.id)
       @spiders.delete(spider)
     end
 
@@ -62,7 +66,7 @@
     end
 
     private def parse_requests(spider : Spider, parsed_item : ParsedItem)
-      RequestStorage.instance.store(spider, parsed_item.requests)
+      RequestStorage.instance.store(spider.id, parsed_item.requests)
     end
 
     private def parse_items(spider : Spider, parsed_item : ParsedItem)
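
(Illustration, not part of this commit: with the fetch loop moved out of add_spider into run, the engine is driven in two steps, as in examples/application.cr. MySpider is a hypothetical Squirm::Spider subclass.)

    engine = Squirm::Engine.new
    engine.add_spider(MySpider.new) # queues the spider's start requests
    engine.run                      # spawns one fetch fiber per spider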
55 changes: 34 additions & 21 deletions src/squirm/request_storage.cr
@@ -11,48 +11,61 @@ module Squirm
     getter requests : Synchronized(Hash(String, Array(Request))) = Synchronized(Hash(String, Array(Request))).new
     getter history : Synchronized(Hash(String, Array(String))) = Synchronized(Hash(String, Array(String))).new
 
-    def store(spider : Spider, request : Request)
-      if @requests.has_key?(spider.id)
-        unless @history[spider.id].includes?(request.url)
-          @history[spider.id].push(request.url)
-          @requests[spider.id].push(request)
+    def store(id : String, request : Request)
+      if @requests.has_key?(id)
+        unless @history[id].includes?(request.url)
+          @history[id].push(request.url)
+          @requests[id].push(request)
         end
       else
-        @history[spider.id] = [request.url]
-        @requests[spider.id] = [request]
+        @history[id] = [request.url]
+        @requests[id] = [request]
       end
     end
 
-    def store(spider : Spider, requests : Array(Request))
+    def store(id : String, requests : Array(Request))
       requests.each do |request|
-        store(spider, request)
+        store(id, request)
       end
     end
 
-    def store(spider : Spider, url : String)
-      store(spider, Request.new(:get, url))
+    def store(id : String, url : String)
+      store(id, Request.new(:get, url))
     end
 
-    def store(spider : Spider, urls : Array(String))
+    def store(id : String, urls : Array(String))
       urls.each do |url|
-        store(spider, url)
+        store(id, url)
       end
     end
 
-    def pop!(spider : Spider) : Request
-      @requests[spider.id].pop
+    def flush(id : String)
+      @requests[id] = [] of Request
     end
 
-    def pop?(spider : Spider) : Request?
-      @requests[spider.id].pop?
+    def clear(id : String)
+      @requests[id] = [] of Request
+      @history[id] = [] of String
     end
 
-    def flush(spider : Spider)
-      @requests[spider.id] = [] of Request
+    def pop!(id : String)
+      @requests[id].pop? || raise Exception.new("Request storage is empty")
     end
 
-    def empty?(spider : Spider) : Bool
-      @requests[spider.id].empty?
+    def delete_history(id : String, url : String)
+      @history[id].delete(url)
+    end
+
+    def seen?(id : String, url : String) : Bool
+      @history[id].includes?(url)
+    end
+
+    def empty?(id : String) : Bool
+      @requests[id].empty?
+    end
+
+    def exists?(id : String) : Bool
+      @requests[id]? != nil || @history[id]? != nil
     end
   end
 end
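
(Illustration, not part of this commit: a minimal exercise of the String-keyed storage API above; the "my-spider" id and URL are hypothetical.)

    storage = Squirm::RequestStorage.instance

    storage.store("my-spider", "https://example.com/") # wraps the URL in a GET Request
    storage.seen?("my-spider", "https://example.com/") # => true, the URL is in history
    request = storage.pop!("my-spider")                # raises if the queue is empty
    storage.empty?("my-spider")                        # => true after the pop
    storage.clear("my-spider")                         # resets both requests and history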
2 changes: 1 addition & 1 deletion src/squirm/spider.cr
@@ -12,7 +12,7 @@ module Squirm
     abstract def parse_item(request : Request, response : Response) : ParsedItem
     abstract def request_filters : Array(RequestFilters::Base)
     abstract def response_filters : Array(ResponseFilters::Base)
-    abstract def timeout : Time::Span
+    abstract def request_timeout : Time::Span
     abstract def concurrent_requests_per_domain : Int32
   end
 end
