-
Notifications
You must be signed in to change notification settings - Fork 3
/
rss_scraper.py
76 lines (64 loc) · 2.26 KB
/
rss_scraper.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import re
from urllib.parse import quote_plus, urljoin

import feedparser
import requests
from bs4 import BeautifulSoup
# URL template for a Craigslist "search/zip" query returned as RSS. The two
# %s placeholders are filled, in order, with the site subdomain and the
# search term.
rss_generic_free_link = "http://%s.craigslist.org/search/zip?query=%s&format=rss"
# NOTE(review): not referenced anywhere in the visible code; presumably a
# cap on summary length in characters. Confirm before relying on it.
SUMMARY_LIMIT = 150
def get_rss(site, term=""):
    """Fetch and parse the Craigslist RSS search feed for one site.

    Args:
        site: Craigslist subdomain substituted into the host name
            (for example ``"norfolk"``).
        term: Free-text search query. Defaults to an empty query.

    Returns:
        Whatever ``feedparser.parse`` returns for the generated search URL.
    """
    # URL-encode the term so that spaces, "&", "#" and similar characters
    # cannot break the query string or leak into other parameters.
    rss_link = rss_generic_free_link % (site, quote_plus(str(term)))
    return feedparser.parse(rss_link)
def parse_item(item):
    """Reduce one feed entry to the handful of fields this module uses.

    Args:
        item: Mapping for a single feed entry. It must contain the keys
            ``'dc_source'``, ``'title'`` and ``'summary'``; an optional
            ``'enc_enclosure'`` mapping may carry a ``'resource'`` value.

    Returns:
        A dict with the keys ``'url'``, ``'title'``, ``'summary'`` and
        ``'picture'``. ``'picture'`` is the enclosure's ``'resource'``
        value, or ``None`` when the entry has no (non-empty) enclosure.

    Raises:
        KeyError: If one of the required keys is missing from ``item``.
    """
    parsed = {
        'url': item['dc_source'],
        'title': item['title'],
        'summary': item['summary'],
        'picture': None,
    }
    enclosure = item.get('enc_enclosure')
    if enclosure:
        parsed['picture'] = enclosure.get('resource')
    return parsed
def parse_item_list(rss):
    """Convert every entry of a parsed feed into a simplified dict.

    Args:
        rss: Parsed feed object exposing an ``entries`` iterable.

    Returns:
        A list with one ``parse_item`` result per entry, in feed order.
    """
    # Build a real list: on Python 3 ``map`` yields a one-shot iterator that
    # cannot be indexed, measured with len(), or iterated twice.
    return [parse_item(entry) for entry in rss.entries]
def parse_rss_feed(site, term=""):
    """Fetch the search feed for ``site``/``term`` and return its parsed items.

    Args:
        site: Craigslist subdomain passed through to ``get_rss``.
        term: Search query passed through to ``get_rss``.

    Returns:
        The result of ``parse_item_list`` applied to the fetched feed.
    """
    return parse_item_list(get_rss(site, term))
# Get the html for a specific webpage
def getItemHTML(url, timeout=10):
    """Download a page and return its body as text.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait on the server before giving up. It is
            forwarded to ``requests.get``. Without a timeout a stalled
            server would block the caller indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: On connection failures or
            when the timeout elapses.
    """
    resp = requests.get(url, timeout=timeout)
    return resp.text
# Parse the url with BeautifulSoup and return the relevant data
def getItemData(url):
    """Scrape a listing page for its map link, coordinates, address and reply e-mail.

    Args:
        url: Address of the listing page to scrape.

    Returns:
        A dict with the keys:

        * ``'map'``: ``href`` of the first element linking to
          maps.google.com, or ``None`` when there is none.
        * ``'replyemail'``: text of the first element with class
          ``anonemail`` on the page behind the ``#replylink`` element, or
          ``None`` when the link or the address is missing.
        * ``'address'``: text of the first ``div.mapaddress``, or ``None``.
        * ``'coordinates'``: dict with ``'lat'`` and ``'lng'`` read from the
          ``data-latitude`` / ``data-longitude`` attributes of the ``#map``
          element; an empty dict when the page has no such element.
    """
    # Name the parser explicitly so results do not depend on which optional
    # parsers happen to be installed.
    soup = BeautifulSoup(getItemHTML(url), "html.parser")

    map_anchor = soup.find(href=re.compile(r"maps\.google\.com"))
    map_link = map_anchor.get('href') if map_anchor is not None else None

    coordinates = {}
    map_div = soup.find(id="map")
    if map_div is not None:
        coordinates['lat'] = map_div.get("data-latitude")
        coordinates['lng'] = map_div.get("data-longitude")

    reply_email = None
    reply_anchor = soup.find(id="replylink")
    reply_href = reply_anchor.get('href') if reply_anchor is not None else None
    if reply_href:
        # Resolve the reply link against the listing's own URL rather than a
        # hard-coded host, so listings from any site resolve correctly.
        reply_page = getItemHTML(urljoin(url, reply_href))
        email_tag = BeautifulSoup(reply_page, "html.parser").find(class_="anonemail")
        if email_tag is not None:
            reply_email = email_tag.text

    address_div = soup.find("div", class_="mapaddress")
    address = address_div.text if address_div is not None else None

    return {
        'map': map_link,
        'replyemail': reply_email,
        'address': address,
        'coordinates': coordinates,
    }
if __name__ == "__main__":
    # Manual smoke test: scrape one sample listing and print the result.
    sample_listing = 'http://norfolk.craigslist.org/zip/4889818045.html'
    print(getItemData(sample_listing))