This repository has been archived by the owner on Aug 26, 2019. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathciti_scraper.rb
85 lines (71 loc) · 2.43 KB
/
citi_scraper.rb
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
# Name: citi_scraper.rb
# Description: Logs in to citibikenyc.com and scrapes the member's trips page into BikeTrip objects.
# Author: Bob Gardner
# Date: 4/28/14
# License: MIT
require_relative 'bike_trip'
require_relative 'exceptions'
require 'mechanize'
# Scrape the Citibike website
# Scrapes the Citi Bike member website.
#
# A single Mechanize agent is kept per instance; it is used to submit the
# login form and to fetch the trips page, whose table rows are converted into
# BikeTrip objects.
class CitiScraper
  LOGIN_URL = 'https://citibikenyc.com/login'
  TRIPS_URL = 'https://citibikenyc.com/member/trips'
  LOGIN_PAGE_TITLE = 'Login | Citi Bike'
  TRIPS_PAGE_TITLE = 'Trips | Citi Bike'
  MIN_TRIP_DURATION = 2 # in minutes

  # BikeTrip attributes, listed in the same order as the columns of a trips
  # table row (td[1] .. td[6]).
  TRIP_ATTRIBUTES = [:id, :start_location, :start_time, :end_location,
                     :end_time, :duration].freeze
  private_constant :TRIP_ATTRIBUTES

  attr_accessor :username, :password
  attr_reader :valid_credentials

  # Creates the Mechanize agent and logs in immediately.
  #
  # Raises whatever #login raises for the given credentials.
  def initialize(username, password)
    @agent = Mechanize.new
    @valid_credentials = false
    login(username, password)
  end

  # Submits the login form with the given credentials (defaulting to the ones
  # stored by the last successful login).
  #
  # On success, marks the credentials as valid and stores them on the instance.
  #
  # Raises Exceptions::LoginError when the site returns to the login page for
  # credentials that have not been accepted before.
  # Raises Exceptions::CitiBikeWebsiteError when the login form is missing,
  # when the site answers with an HTTP error code, or when the exact
  # credentials that were accepted earlier are now rejected.
  def login(username = @username, password = @password)
    @agent.get(LOGIN_URL)
    form = @agent.page.forms.first
    # No form on the page means the site did not serve the expected login
    # page; report that as a website problem rather than a NoMethodError.
    raise Exceptions::CitiBikeWebsiteError unless form

    form['subscriberUsername'] = username
    form['subscriberPassword'] = password
    form.submit

    # Still on the login page after submitting => the login was rejected.
    if @agent.page.title == LOGIN_PAGE_TITLE
      # Only blame the website when the very same credentials worked before;
      # a different username/password pair is simply an invalid login.
      previously_accepted = @valid_credentials &&
                            username == @username && password == @password
      raise Exceptions::CitiBikeWebsiteError if previously_accepted
      raise Exceptions::LoginError, 'Invalid username or password.'
    end

    @valid_credentials = true
    @password = password
    @username = username
  rescue Mechanize::ResponseCodeError => e
    handle_error(e)
    raise Exceptions::CitiBikeWebsiteError
  end

  # Returns this month's trips as an array of BikeTrip objects.
  #
  # Trips that are in progress (no parsable duration) or that lasted less than
  # MIN_TRIP_DURATION minutes are left out.
  def trips
    @agent.get(TRIPS_URL)
    # If session expires, re-login to citibikenyc.com. The site will redirect
    # back to TRIPS_URL upon sign in (friendly forwarding)
    login unless @agent.page.title == TRIPS_PAGE_TITLE

    rows = Nokogiri::HTML(@agent.page.body).xpath('//table/tbody/tr')
    finished = rows.select do |row|
      minutes = duration_in_minutes(row.at_xpath('td[6]/text()').to_s)
      minutes && minutes >= MIN_TRIP_DURATION
    end
    finished.map { |row| row_to_trip(row) }
  end

  private

  # Handle Citi Bike Website errors (e.g. Net::HTTPGatewayTimeOut) by printing
  # the message and backtrace to standard output.
  def handle_error(error)
    puts error.message
    puts error.backtrace.join("\n")
  end

  # Parses a duration cell such as "12m 3s" into whole minutes.
  #
  # Returns nil when the text contains neither a minutes nor an hours part
  # (e.g. an empty cell), so the caller can skip the row.
  # NOTE(review): an hours part written as "<n>h" is assumed, not confirmed
  # against the live site; it is counted so long trips are not mistaken for
  # short ones.
  def duration_in_minutes(text)
    minutes = text[/(\d+)m/, 1]
    hours = text[/(\d+)h/, 1]
    return nil unless minutes || hours

    (hours.to_i * 60) + minutes.to_i
  end

  # Convert HTML row into bike trip object: the stripped text of column i + 1
  # is assigned to the i-th attribute of TRIP_ATTRIBUTES.
  def row_to_trip(row)
    trip = BikeTrip.new
    TRIP_ATTRIBUTES.each_with_index do |name, i|
      trip.send("#{name}=", row.at_xpath("td[#{i + 1}]/text()").to_s.strip)
    end
    trip
  end
end