-
Notifications
You must be signed in to change notification settings - Fork 0
/
London_Spider.py
62 lines (47 loc) · 2.33 KB
/
London_Spider.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
import re
import scrapy
from scrapy import Request
from scrapy.loader import ItemLoader
from itemloaders.processors import Join
# Module-level page counter, initialised to zero when the module is imported.
# NOTE(review): as a module global, this value is shared by every callback in
# the file rather than tracked per request -- confirm that is intended.
page_count = 0
def _joined_field():
    """Build an item field whose collected values are merged by ``Join()``."""
    return scrapy.Field(output_processor=Join())


class Property(scrapy.Item):
    """Item describing a single scraped rental listing.

    Every field uses the ``Join`` output processor, so the values gathered
    by an item loader for that field come out as one string.
    """

    # Listing headline text.
    title = _joined_field()
    # Listing price, stored as a string.
    price = _joined_field()
    # Link to the listing's page.
    url = _joined_field()
class LondonrelocationSpider(scrapy.Spider):
    """Spider that collects rental listings from londonrelocation.com.

    Crawl flow: ``parse`` receives each start URL response and hands it to
    ``parse_area``, which requests every area link found on that page.
    ``parse_area_pages`` then yields one ``Property`` item per listing and
    follows pagination for at most ``max_pages_per_area`` pages per area.
    """

    name = 'londonrelocation'
    allowed_domains = ['londonrelocation.com']
    start_urls = ['https://londonrelocation.com/properties-to-rent/']

    # Maximum number of result pages scraped for each area.
    max_pages_per_area = 2

    def parse(self, response):
        """Process a start URL response.

        The response is already the page that lists the areas, so it is
        handed straight to ``parse_area`` instead of being downloaded a
        second time.
        """
        yield from self.parse_area(response)

    def parse_area(self, response):
        """Yield one request per area link found on the overview page."""
        area_urls = response.xpath(
            './/div[contains(@class,"area-box-pdh")]//h4/a/@href'
        ).getall()
        for area_url in area_urls:
            # urljoin leaves absolute links untouched and resolves relative
            # ones, which Request would otherwise reject for lacking a scheme.
            yield Request(
                url=response.urljoin(area_url),
                callback=self.parse_area_pages,
            )

    def parse_area_pages(self, response, page=1):
        """Yield the listings on one results page, then follow pagination.

        Args:
            response: the downloaded results page for an area.
            page: 1-based index of this page within its area. It is carried
                from request to request through ``cb_kwargs`` so that each
                area keeps its own count, even though responses from
                different areas arrive interleaved.

        Yields:
            Loaded ``Property`` items, followed by at most one request for
            the next results page.
        """
        # Extract the fields per listing container so that a listing with a
        # missing field cannot shift the title/price/url of its neighbours.
        for listing in response.xpath("//div[@class= 'right-cont']"):
            title = listing.xpath("./div/h4/a/text()").get()
            raw_price = listing.xpath("./div[3]/h5/text()").get()
            href = listing.xpath("./div/h4/a/@href").get()
            if title is None or raw_price is None or href is None:
                continue

            price = self._normalise_price(raw_price)
            if price is None:
                # No digits in the price text; nothing sensible to store.
                continue

            loader = ItemLoader(item=Property())
            loader.add_value('title', title)
            loader.add_value('price', price)
            loader.add_value('url', response.urljoin(href))
            yield loader.load_item()

        # Stop once the per-area page budget has been used up.
        if page >= self.max_pages_per_area:
            return

        next_page = self._next_page_url(response)
        if next_page:
            yield Request(
                response.urljoin(next_page),
                callback=self.parse_area_pages,
                cb_kwargs={'page': page + 1},
            )

    @staticmethod
    def _normalise_price(raw_price):
        """Return the numeric price as a string, or None if it has no digits.

        Prices whose text contains 'pw' (presumably "per week") are
        multiplied by 4; all other prices are kept as they are.
        """
        digits = re.sub(r"[^0-9]", "", raw_price)
        if not digits:
            return None
        factor = 4 if 'pw' in raw_price else 1
        return str(int(digits) * factor)

    @staticmethod
    def _next_page_url(response):
        """Return the href that follows the current page marker, or None.

        The current page is taken to be the first pagination ``<li>`` that
        contains no anchor. Because that entry contributes no href, its
        position in the ``<li>`` list indexes the link of the entry after it
        in the list of hrefs.
        """
        entries = response.xpath("//div[@class='pagination']/ul/li").getall()
        if len(entries) <= 3:
            return None

        current = next(
            (i for i, html in enumerate(entries) if '<a' not in html),
            None,
        )
        links = response.xpath(
            "//div[@class='pagination']/ul/li/a/@href"
        ).getall()
        # No current-page marker, or the marker is the last entry: no next page.
        if current is None or current >= len(links):
            return None
        return links[current]