ZameenScraper.py
# -*- coding: utf-8 -*-
"""
Created on Sun Mar 11 23:47:14 2018
@author: Umar Bin Khalid
"""
import requests
from bs4 import BeautifulSoup

"""
This module parses listings on 'beta.zameen.com'.
As of 10 March 2018, it had been tested on Lahore city, for houses both for sale and for rent.
Please see the following functions to understand how it works:
    __init__(self, ... )
    getListings(self)
A minimal usage sketch is included at the bottom of this file.
"""

class Zameen:
    location_link = ""
    price_min = 0
    price_max = 0
    purpose = 0
    page = 1

    def __init__(self, location_link='https://beta.zameen.com/Homes/Lahore_Johar_Town-93-1.html',
                 price_min=0,
                 price_max=0,
                 purpose=0,
                 page=1):
        """Constructor

        Keyword arguments:
        location_link -- The link that will be parsed; it may be modified according to the following arguments.
        price_min -- Lower limit of the price filter.
        price_max -- Upper limit of the price filter.
        purpose -- Whether to parse '0: Buy/Sell' or '1: Rentals'.
        page -- Which page to parse for the given area.
        """
        self.location_link = location_link
        self.price_min = price_min
        self.price_max = price_max
        self.purpose = purpose  # Buy is 0, rent is 1.
        self.page = page

    def getLink(self):
        linkBreakdown = self.location_link.split("-")
        linkBreakdown[2] = str(self.page) + ".html"
        new_location_link = self.unsplit(linkBreakdown, "-")
        if self.purpose == 1:
            # Split the already-updated link so the page number set above is preserved.
            linkBreakdown = new_location_link.split("/")
            linkBreakdown[3] = "Rentals"
            new_location_link = self.unsplit(linkBreakdown, "/")
        if self.price_min > 0:
            new_location_link += "?price_min=" + str(self.price_min)
            if self.price_max > 0:
                new_location_link += "&price_max=" + str(self.price_max)
        elif self.price_max > 0:
            new_location_link += "?price_max=" + str(self.price_max)
        return new_location_link
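
    # Illustrative example of the URL produced by getLink(); the values below are
    # hypothetical and assume the default location_link. The exact URL scheme on
    # beta.zameen.com may have changed since this module was written.
    #   Zameen(purpose=1, page=2, price_min=50000, price_max=100000).getLink()
    #   -> 'https://beta.zameen.com/Rentals/Lahore_Johar_Town-93-2.html?price_min=50000&price_max=100000'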

    def unsplit(self, strlist, splitarg):
        # Rejoin a split list with the original separator (equivalent to splitarg.join(strlist)).
        strUnsplit = ""
        for item in strlist:
            strUnsplit += item + splitarg
        return strUnsplit[0:-1]

    def getHtmlDoc(self):
        return requests.get(self.getLink()).text

    def getSoup(self):
        return BeautifulSoup(self.getHtmlDoc(), 'lxml')

    def getListings(self):
        """Return parsed listing data for the desired area.

        If the page does not contain any data, the method returns the following tuple:
            self.page -- Number of the page that was parsed.
            0 -- In place of total_pages.
            None -- In place of the listings dictionary.

        Otherwise it returns the listings dictionary (the page-count parsing that
        produced current_page and total_pages is currently commented out):
            Key: Data index assigned by 'zameen.com' to the listing.
            Value: Another dictionary containing parsed data with the following keys:
                link -- Link to the listing on 'zameen.com'.
                title -- Title of the listing.
                location -- Location of the property.
                price -- Price/Rent of the property.
                area -- Area of the property.
                description -- Description of the listing.
                addedBy -- Date on which the listing was created and updated.
                beds -- Number of beds (can be None; currently commented out).
        """
        listings = {}
        soup = self.getSoup()
        try:
            listings_list = soup.find('ul', attrs={'class': 'left search-list list-view'})
            listing_items = listings_list.findAll('li', attrs={'class': 'listig-card-outter'})
        except AttributeError:
            print("No data on this page.")
            return self.page, 0, None
        for listing in listing_items:
            single = {}
            anchor = listing.find('a')
            listing_dsc = anchor.find('div', attrs={'class': 'listing-card-dsc left'})
            single['link'] = anchor['href']  # Link to listing.
            single['title'] = listing_dsc.find('div', attrs={'class': 'title-wrap'}).div.text  # Title.
            single['location'] = listing_dsc.find('div', attrs={'class': 'location'}).text  # Location of the property.
            single['price'] = listing_dsc.div.findAll('span')[1].text  # Price of the property.
            single['area'] = listing_dsc.find('ul', attrs={'class': 'left slider_pinfo'}).find('li', attrs={'id': 'area'}).span.text  # Area of the property.
            single['description'] = listing_dsc.find('p', attrs={'class': 'description left'}).text  # Description of the property.
            single['addedBy'] = listing_dsc.find('span', attrs={'class': 'addedby'}).text  # Date on which listing was created and updated.
            # try:  # Number of beds.
            #     single['beds'] = listing_dsc.find('ul', attrs={'class': 'left slider_pinfo'}).find('li', attrs={'id': 'bed'}).span.text
            # except AttributeError:
            #     single['beds'] = None
            listings[int(listing['data-index'])] = single
        # page_info = soup.find('div', attrs={'class': 'pg_counts'}).text
        #
        # current_page = int(page_info.split(" ")[1])
        # total_pages = int(page_info.split(" ")[3].strip())
        return listings
        # return single['price']
        # return single['link']
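

# Minimal usage sketch (not part of the original module): it assumes the March 2018
# markup of beta.zameen.com is still served, which may no longer be the case, and it
# simply uses the class's default location link for Johar Town, Lahore.
if __name__ == '__main__':
    scraper = Zameen(purpose=0,  # 0 = Buy/Sell, 1 = Rentals.
                     page=1)
    result = scraper.getListings()
    if isinstance(result, dict):  # On an empty page the method returns (page, 0, None) instead.
        for data_index, item in result.items():
            print(data_index, item['title'], item['price'])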