forked from joelgrus/hackernews
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape_api.rb
75 lines (56 loc) · 1.55 KB
/
scrape_api.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
# scrape new stories from the unofficial hacker news api
# the API is broken half the time, so this needs to be made more robust
require 'open-uri'
require 'json'
require_relative "story"
require_relative "utils"
require_relative "model"
# Log the start time of this run so successive scrapes can be told apart in
# the output.
puts Time.now
# Scrape newly submitted stories from the unofficial Hacker News API and save
# every story for which no Story with the same hnid is found.
#
# Pages are fetched starting at +base_url+. Each response body is parsed as
# JSON and must contain an "items" array; an optional "nextId" is used to
# build the URL of the following page. Paging stops after +max_pages+ pages,
# after the first page that contains an already-known story, or when the
# response carries no "nextId". Stories are only saved once paging is done.
#
# @param max_pages [Integer] maximum number of pages to fetch (default 1)
# @param base_url [String] URL of the first page; later pages are fetched
#   from "<base_url>/<nextId>"
# @return [Array] the newly created stories, in the order they were found
# @raise [StandardError] any error raised while fetching or parsing a page
#   propagates to the caller; nothing is saved in that case
def scrape(max_pages = 1, base_url)
  new_stories = []
  url = base_url
  found_known_story = false
  model = Model.load

  (1..max_pages).each do |page|
    puts "opening page #{page}: #{url}"
    # URI.open is required on Ruby 3.0+, where Kernel#open no longer fetches
    # URLs. Reading the whole body (not just the first line) keeps multi-line
    # JSON parseable, and the block form closes the handle when done.
    body = URI.open(url, &:read)
    puts "found #{body.size} characters"
    doc = JSON.parse(body)
    next_id = doc["nextId"]

    doc["items"].each do |item|
      item_id = item["id"]

      if Story.where(:hnid => item_id).count > 0
        found_known_story = true
        puts "known story: #{item_id}"
        next
      end

      story = Story.new
      story.hnid = item_id
      story.link_url = item["url"]
      story.link_title = item["title"]
      story.domain = domain(item["url"])
      story.scraped_at = Time.now
      story.user = item["postedBy"]
      # Model.load may return nil/false; in that case no prediction is set.
      story.prediction = model.classify(story) if model
      new_stories << story
    end

    # The rest of the current page is still collected above; only the
    # following pages are skipped once a known story has been seen.
    break if found_known_story
    break unless next_id

    url = "#{base_url}/#{next_id}"
    puts "moving ahead to #{url}"
  end

  puts "found #{new_stories.size} new stories"
  new_stories.each do |story|
    story.save
    puts "new story:"
    puts story.hnid
    puts story.link_title
    puts story.link_url
  end
end
# Entry point: scrape up to 20 pages of new stories. Failures are expected
# (see the note at the top of the file), so errors are reported rather than
# crashing the script.
begin
  scrape(20, "http://api.ihackernews.com/new")
rescue StandardError => e
  puts "new failed, try again later"
  # Report the cause on stderr so a failed run can be diagnosed without
  # changing what is written to stdout.
  warn "#{e.class}: #{e.message}"
end