-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyAarz.py
156 lines (119 loc) · 5.48 KB
/
pyAarz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 19:54:39 2018
@author: Umar Bin Khalid
"""
import re
import requests
from bs4 import BeautifulSoup
class Aarz:
    """Scraper for property listings on aarz.pk (Lahore search results).

    Builds a filtered search URL from the constructor arguments, fetches
    the page with ``requests``, and parses the listing cards with
    BeautifulSoup (lxml parser).
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__. Kept for backward compatibility with any code that reads
    # them off the class itself.
    location = ""
    price_min = 0
    price_max = 0
    purpose = 0
    page = 1

    # Pattern extracting a plot area such as "5 Marla" or "1 Kanal" from a
    # listing title. Compiled once here instead of once per listing
    # (originally re-compiled inside the parsing loop).
    _AREA_RE = re.compile(r"\d*\s(Kanal|Marla)", re.I)

    def __init__(self, location='samanabad',
                 price_min=0,
                 price_max=0,
                 purpose=0,
                 page=1):
        """Constructor.

        Keyword arguments:
        location -- Area whose listings will be parsed. Use the location
                    name (the 'key' in the dictionary generated by
                    'pyAraz.Locations'), not a link.
        price_min -- Lower price limit; ignored when <= 0.
        price_max -- Upper price limit; ignored when <= 0.
        purpose -- Whether to parse '0: Buy/Sell' or '1: Rentals'.
        page -- Which result page to parse for the area.
        """
        self.location = location
        self.price_min = price_min
        self.price_max = price_max
        self.purpose = purpose  # Buy is 0, rent is 1.
        self.page = page

    def getLink(self):
        """Return the aarz.pk search URL for the configured filters.

        Price parameters are appended only when positive; purpose is
        rendered as the literal string "Sell" (0) or "Rent" (any other
        value), matching the site's query-string format.
        """
        link = ("https://www.aarz.pk/search/page/" + str(self.page)
                + "?city_s=Lahore")
        link += "&loc1=" + self.location
        if self.price_min > 0:
            link += "&price_min=" + str(self.price_min)
        if self.price_max > 0:
            link += "&price_max=" + str(self.price_max)
        link += "&purpose=" + ("Sell" if self.purpose == 0 else "Rent")
        return link

    def getHtmlDoc(self):
        """Fetch the search page and return its raw HTML text."""
        return requests.get(self.getLink()).text

    def getSoup(self):
        """Return a BeautifulSoup tree of the search page (lxml parser)."""
        return BeautifulSoup(self.getHtmlDoc(), 'lxml')

    def getListings(self):
        """Parse the configured search page and return its listings.

        Returns:
            On success, a dict keyed by a running item index (seeded from
            the page's "show_items" counter), each value a dict with keys:
                link -- Absolute URL of the listing on aarz.pk.
                title -- Title of the listing.
                price -- Price/rent string as shown on the page.
                area -- "<n> Kanal/Marla" extracted from the title,
                        or None when the title carries no area.
                address -- Address line of the property.
                description -- Description text, or None when the
                               listing has no description block.
                addedOn -- First three whitespace-separated tokens of
                           the listing's side-info (the posting date).
            When the page has no results container, the tuple
            (self.page, 0, None) — kept for backward compatibility with
            existing callers that unpack the failure case.
        """
        listings = {}
        soup = self.getSoup()
        try:
            # The results header reads like "Showing X - Y of Z items";
            # token 3 is the index of the first item on this page.
            item_count = int(
                soup.find('div', attrs={'class': 'show_items'})
                    .p.text.split(" ")[3])
        except AttributeError:
            # No results container on the page: nothing to parse.
            return self.page, 0, None
        for card in soup.findAll('div', attrs={'class': 'property-listing row'}):
            single = {}
            single['link'] = "https://www.aarz.pk" + card.h2.a['href']
            single['title'] = card.h2.a.text
            single['price'] = card.h2.h4.text
            match = self._AREA_RE.search(single['title'])
            single['area'] = match.group(0) if match is not None else None
            info_div = card.find(
                'div', attrs={'class': 'col-md-5 col-sm-4 col-xs-12'})
            single['address'] = info_div.h4.a.text
            # NOTE(review): beds/baths parsing from the 'property-features'
            # div was commented out upstream and never populated; dropped
            # here rather than re-enabled.
            try:
                single['description'] = info_div.find(
                    'div', attrs={'class': 'text-muted property-desc'}
                ).p.text.strip()
            except AttributeError:
                # Listing has no description block.
                single['description'] = None
            added_info = info_div.find(
                'div', attrs={'class': 'text-muted property-side-info'}
            ).text.strip().split(" ")
            single['addedOn'] = " ".join(added_info[:3])
            listings[item_count] = single
            item_count += 1
        return listings