Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion lib/retriever/fetch.rb
Original file line number Diff line number Diff line change
Expand Up @@ -232,7 +232,7 @@ def new_visitable_links(current_page)

# Runs the caller-supplied block against +current_page+ and stores what it
# returns in @result.
#
# url          - URL of the page being processed; only used in the log line.
# current_page - page object handed to the block.
# block        - callable that extracts custom data from the page.
#
# A nil or empty block result is skipped rather than stored.
def push_custom_to_result(url, current_page, &block)
  data = block.call(current_page)
  # nil must be tested before #empty?: calling nil.empty? raises NoMethodError,
  # and the result is pushed exactly once.
  @result.push(data) unless data.nil? || data.empty?
  lg("-- PageIterator called on: #{url}")
end

Expand Down
4 changes: 4 additions & 0 deletions lib/retriever/page.rb
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,10 @@ def initialize(url, source, t)
@links = nil
end

# Returns whatever the object stored in @t reports as its headers.
# NOTE(review): @t is presumably the target passed to #initialize as `t` —
# confirm against the constructor body, which is not fully visible here.
def headers
  target = @t
  target.headers
end

# receives page source as string
# returns array of unique href links
def links
Expand Down
21 changes: 14 additions & 7 deletions lib/retriever/target.rb
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ module Retriever
class Target
HTTP_RE = Regexp.new(/^http/i).freeze

attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port
attr_reader :host, :target, :host_re, :source, :file_re, :scheme, :port, :headers

def initialize(url, file_re = nil)
fail 'Bad URL' unless url.include?('.')
Expand All @@ -18,6 +18,7 @@ def initialize(url, file_re = nil)
@file_re ||= file_re
@scheme = target_uri.scheme
@port = target_uri.port
@headers = {}
end

def source
Expand All @@ -28,7 +29,10 @@ def source
# if redirect URL is same host, we want to re-sync @target
return resync_target_and_return_source(resp_url)
end
resp = resp.read

@headers = resp.meta
resp = resp.read

#
fail 'Domain is not working. Try the non-WWW version.' if resp == ''
fail 'Domain not working. Try HTTPS???' unless resp
Expand All @@ -37,11 +41,14 @@ def source
end

# Re-points this target at +url+ (used when a redirect stays on the same
# host) and returns the source fetched from the new location.
#
# url - the redirect destination to adopt.
#
# Overwrites @target, @host, @scheme and @headers with the values of a
# freshly built Retriever::Target. Returns whatever that target's #source
# returns.
def resync_target_and_return_source(url)
  new_t = Retriever::Target.new(url)
  @target = new_t.target
  @host = new_t.host
  @scheme = new_t.scheme
  # Fetch before copying headers: a new target starts with empty headers and
  # only fills them in while running #source. Build and fetch exactly once.
  source = new_t.source
  @headers = new_t.headers

  source
end
end
end
1 change: 1 addition & 0 deletions readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,7 @@ puts t.result.to_s
Available methods on the page iterator:
* **#url** - returns full URL of current page
* **#source** - returns raw page source code
* **#headers** - returns the page's response headers
* **#title** - returns HTML-decoded version of current page title
* **#desc** - returns HTML-decoded version of current page meta description
* **#h1** - returns HTML-decoded version of current page's h1 tag
Expand Down
2 changes: 1 addition & 1 deletion rubyretriever.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ $LOAD_PATH.unshift(lib) unless $LOAD_PATH.include?(lib)
require 'retriever/version'

Gem::Specification.new do |s|
s.required_ruby_version = ['>= 2.0', '< 2.3']
s.required_ruby_version = ['>= 2.0', '<= 2.8']
s.platform = Gem::Platform::RUBY
s.version = Retriever::VERSION
s.name = 'rubyretriever'
Expand Down
7 changes: 7 additions & 0 deletions spec/page_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,13 @@
end
end

# Checks that Page#headers exposes a non-empty header collection.
# `t` is defined outside this hunk (presumably a Retriever::Target — confirm).
# The page body passed to Page.new comes from t.source, so building `page`
# also evaluates t.source.
describe '#headers' do
let(:page) { Retriever::Page.new('http://www.cnet.com/', t.source, t) }
it 'returns current page response HEADERS' do
expect(page.headers).to_not be_empty
end
end

describe '#links' do
let(:source) { "<a href='/profile/'>profile</a><a href='#top'>top</a> <link rel='stylesheet' id='gforms_reset_css-css' href='http://www.cnet.com/wp-content/plugins/gravityforms/css/formreset.css?ver=1.7.12' type='text/css' media='all' />" }
let(:page) { Retriever::Page.new('http://www.cnet.com/', source, t) }
Expand Down
24 changes: 24 additions & 0 deletions spec/target_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -52,4 +52,28 @@
expect { redirecting_url }.to raise_error
end
end

# Exercises Target#source: a successful fetch returns a String, and a target
# that redirects to a different host is expected to raise.
# NOTE(review): these examples appear to hit live URLs — confirm whether the
# suite stubs HTTP elsewhere.
describe '#source' do
# `let` is lazy: the fetch only happens when an example references it.
let(:redirecting_url) do
Retriever::Target.new('http://software-by-joe.appspot.com').source
end

it 'opens URL and returns source as String' do
expect(Retriever::Target.new('http://techcrunch.com/').source.class)
.to eq(String)
end

it 'fails if target redirects to new host' do
# NOTE(review): a bare raise_error matches any exception (including
# NameError/NoMethodError from a typo); consider pinning the expected
# error class or message.
expect { redirecting_url }.to raise_error
end
end

# Verifies that fetching a page populates Target#headers.
describe '#headers' do
let(:target) {Retriever::Target.new('http://techcrunch.com/')}
it 'opens the URL and sets the headers' do
# Order matters: headers are initialised to {} in the constructor and are
# only assigned from the response inside #source, so #source must run first.
expect(target.source).to_not be_empty
expect(target.headers).to_not be_empty
end
end

end