diff --git a/lib/retriever/fetch.rb b/lib/retriever/fetch.rb
index 184b7b8..3c61f93 100644
--- a/lib/retriever/fetch.rb
+++ b/lib/retriever/fetch.rb
@@ -232,7 +232,7 @@ def new_visitable_links(current_page)
+ # Calls the given block with current_page and appends its return value to
+ # @result. Nil, false and empty return values are not recorded.
def push_custom_to_result(url, current_page, &block)
data = block.call current_page
- @result.push(data) unless data.empty?
+ # `if data` covers both nil and false, so #empty? is only sent to a real value
+ @result.push(data) if data && !data.empty?
lg("-- PageIterator called on: #{url}")
end
diff --git a/lib/retriever/page.rb b/lib/retriever/page.rb
index abc0da9..98daa86 100644
--- a/lib/retriever/page.rb
+++ b/lib/retriever/page.rb
@@ -42,6 +42,10 @@ def initialize(url, source, t)
@links = nil
end
+ def headers # response headers, read from this page's @t rather than stored on the page itself
+ @t.headers # NOTE(review): presumably @t is the Target given to #initialize, whose headers are filled in by Target#source — confirm they describe this page's URL and not an earlier fetch
+ end
+
# receives page source as string
# returns array of unique href links
def links
diff --git a/lib/retriever/target.rb b/lib/retriever/target.rb
index 138ab69..f6e51c7 100644
--- a/lib/retriever/target.rb
+++ b/lib/retriever/target.rb
@@ -6,7 +6,7 @@ module Retriever
class Target
HTTP_RE = Regexp.new(/^http/i).freeze
- attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port
+ attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port, :headers
def initialize(url, file_re = nil)
fail 'Bad URL' unless url.include?('.')
@@ -18,6 +18,7 @@ def initialize(url, file_re = nil)
@file_re ||= file_re
@scheme = target_uri.scheme
@port = target_uri.port
+ @headers = {}
end
def source
@@ -28,7 +29,10 @@ def source
# if redirect URL is same host, we want to re-sync @target
return resync_target_and_return_source(resp_url)
end
- resp = resp.read
+
+ @headers = resp.meta
+ resp = resp.read
+
#
fail 'Domain is not working. Try the non-WWW version.' if resp == ''
fail 'Domain not working. Try HTTPS???' unless resp
@@ -37,11 +41,14 @@ def source
end
+ # Re-points this target at a same-host redirect destination and returns the
+ # page source fetched from it.
def resync_target_and_return_source(url)
- new_t = Retriever::Target.new(url)
- @target = new_t.target
- @host = new_t.host
- @scheme = new_t.scheme
- new_t.source
+ new_t = Retriever::Target.new(url)
+ # Fetch first: new_t.source may itself follow a further same-host redirect
+ # and re-sync new_t, and it is the call that fills in new_t.headers.
+ source = new_t.source
+ @target = new_t.target
+ @host = new_t.host
+ @scheme = new_t.scheme
+ # keep the port in step with the scheme (e.g. an http -> https redirect)
+ @port = new_t.port
+ @headers = new_t.headers
+
+ source
end
end
end
diff --git a/readme.md b/readme.md
index 84e8c4e..c08d184 100644
--- a/readme.md
+++ b/readme.md
@@ -140,6 +140,7 @@ puts t.result.to_s
Available methods on the page iterator:
* **#url** - returns full URL of current page
* **#source** - returns raw page source code
+* **#headers** - returns the page's response headers
* **#title** - returns html decoded verson of curent page title
* **#desc** - returns html decoded verson of curent page meta description
* **#h1** - returns html decoded verson of current page's h1 tag
diff --git a/rubyretriever.gemspec b/rubyretriever.gemspec
index 404a01a..30b7527 100644
--- a/rubyretriever.gemspec
+++ b/rubyretriever.gemspec
@@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'retriever/version'
Gem::Specification.new do |s|
- s.required_ruby_version = ['>= 2.0', '< 2.3']
+ s.required_ruby_version = ['>= 2.0', '<= 2.8']
s.platform = Gem::Platform::RUBY
s.version = Retriever::VERSION
s.name = 'rubyretriever'
diff --git a/spec/page_spec.rb b/spec/page_spec.rb
index 2e61f99..76dbe98 100644
--- a/spec/page_spec.rb
+++ b/spec/page_spec.rb
@@ -26,6 +26,13 @@
end
end
+ # Page#headers should expose a non-empty header set for a page built from t.
+ # NOTE(review): t is defined outside this hunk; t.source presumably performs a
+ # real HTTP fetch, so this example needs network access — confirm.
+ describe '#headers' do
+   let(:page) do
+     Retriever::Page.new('http://www.cnet.com/', t.source, t)
+   end
+
+   it 'returns current page response HEADERS' do
+     expect(page.headers).not_to be_empty
+   end
+ end
+
describe '#links' do
let(:source) { "profiletop " }
let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
diff --git a/spec/target_spec.rb b/spec/target_spec.rb
index 4ec7732..e42083d 100644
--- a/spec/target_spec.rb
+++ b/spec/target_spec.rb
@@ -52,4 +52,28 @@
expect { redirecting_url }.to raise_error
end
end
+
+ # Target#source: returns the fetched body as a String and raises when the
+ # target redirects to a different host.
+ # NOTE(review): the context just above this hunk ends with the same
+ # 'raise_error' expectation — confirm this group does not duplicate an
+ # existing '#source' describe.
+ describe '#source' do
+   let(:redirecting_url) { Retriever::Target.new('http://software-by-joe.appspot.com').source }
+
+   it 'opens URL and returns source as String' do
+     fetched = Retriever::Target.new('http://techcrunch.com/').source
+     expect(fetched.class).to eq(String)
+   end
+
+   it 'fails if target redirects to new host' do
+     expect { redirecting_url }.to raise_error
+   end
+ end
+
+ # Target#headers starts out empty and is only filled in by #source, so the
+ # example must call #source before inspecting the headers.
+ describe '#headers' do
+   let(:target) do
+     Retriever::Target.new('http://techcrunch.com/')
+   end
+
+   it 'opens the URL and sets the headers' do
+     expect(target.source).not_to be_empty
+     expect(target.headers).not_to be_empty
+   end
+ end
+
end