diff --git a/lib/retriever/fetch.rb b/lib/retriever/fetch.rb index 184b7b8..3c61f93 100644 --- a/lib/retriever/fetch.rb +++ b/lib/retriever/fetch.rb @@ -232,7 +232,7 @@ def new_visitable_links(current_page) def push_custom_to_result(url, current_page, &block) data = block.call current_page - @result.push(data) unless data.empty? + @result.push(data) unless data.nil? || data.empty? lg("-- PageIterator called on: #{url}") end diff --git a/lib/retriever/page.rb b/lib/retriever/page.rb index abc0da9..98daa86 100644 --- a/lib/retriever/page.rb +++ b/lib/retriever/page.rb @@ -42,6 +42,10 @@ def initialize(url, source, t) @links = nil end + def headers + @t.headers + end + # receives page source as string # returns array of unique href links def links diff --git a/lib/retriever/target.rb b/lib/retriever/target.rb index 138ab69..f6e51c7 100644 --- a/lib/retriever/target.rb +++ b/lib/retriever/target.rb @@ -6,7 +6,7 @@ module Retriever class Target HTTP_RE = Regexp.new(/^http/i).freeze - attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port + attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port, :headers def initialize(url, file_re = nil) fail 'Bad URL' unless url.include?('.') @@ -18,6 +18,7 @@ def initialize(url, file_re = nil) @file_re ||= file_re @scheme = target_uri.scheme @port = target_uri.port + @headers = {} end def source @@ -28,7 +29,10 @@ def source # if redirect URL is same host, we want to re-sync @target return resync_target_and_return_source(resp_url) end - resp = resp.read + + @headers = resp.meta + resp = resp.read + # fail 'Domain is not working. Try the non-WWW version.' if resp == '' fail 'Domain not working. Try HTTPS???' unless resp @@ -37,11 +41,14 @@ def source end def resync_target_and_return_source(url) - new_t = Retriever::Target.new(url) - @target = new_t.target - @host = new_t.host - @scheme = new_t.scheme - new_t.source + new_t = Retriever::Target.new(url) + @target = new_t.target + @host = new_t.host + @scheme = new_t.scheme + source = new_t.source + @headers = new_t.headers + + source end end end diff --git a/readme.md b/readme.md index 84e8c4e..c08d184 100644 --- a/readme.md +++ b/readme.md @@ -140,6 +140,7 @@ puts t.result.to_s Available methods on the page iterator: * **#url** - returns full URL of current page * **#source** - returns raw page source code +* **#headers** - returns the page's response headers * **#title** - returns html decoded verson of curent page title * **#desc** - returns html decoded verson of curent page meta description * **#h1** - returns html decoded verson of current page's h1 tag diff --git a/rubyretriever.gemspec b/rubyretriever.gemspec index 404a01a..30b7527 100644 --- a/rubyretriever.gemspec +++ b/rubyretriever.gemspec @@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib) require 'retriever/version' Gem::Specification.new do |s| - s.required_ruby_version = ['>= 2.0', '< 2.3'] + s.required_ruby_version = ['>= 2.0', '<= 2.8'] s.platform = Gem::Platform::RUBY s.version = Retriever::VERSION s.name = 'rubyretriever' diff --git a/spec/page_spec.rb b/spec/page_spec.rb index 2e61f99..76dbe98 100644 --- a/spec/page_spec.rb +++ b/spec/page_spec.rb @@ -26,6 +26,13 @@ end end + describe '#headers' do + let(:page) { Retriever::Page.new('http://www.cnet.com/', t.source, t) } + it 'returns current page response HEADERS' do + expect(page.headers).to_not be_empty + end + end + describe '#links' do let(:source) { "profiletop " } let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) } diff --git a/spec/target_spec.rb b/spec/target_spec.rb index 4ec7732..e42083d 100644 --- a/spec/target_spec.rb +++ b/spec/target_spec.rb @@ -52,4 +52,28 @@ expect { redirecting_url }.to raise_error end end + + describe '#source' do + let(:redirecting_url) do + Retriever::Target.new('http://software-by-joe.appspot.com').source + end + + it 'opens URL and returns source as String' do + expect(Retriever::Target.new('http://techcrunch.com/').source.class) + .to eq(String) + end + + it 'fails if target redirects to new host' do + expect { redirecting_url }.to raise_error + end + end + + describe '#headers' do + let(:target) {Retriever::Target.new('http://techcrunch.com/')} + it 'opens the URL and sets the headers' do + expect(target.source).to_not be_empty + expect(target.headers).to_not be_empty + end + end + end