-
Notifications
You must be signed in to change notification settings - Fork 0
/
pyAarz.py
156 lines (119 loc) · 5.48 KB
/
pyAarz.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
# -*- coding: utf-8 -*-
"""
Created on Tue Mar 13 19:54:39 2018
@author: Umar Bin Khalid
"""
import re
import requests
from bs4 import BeautifulSoup
class Aarz:
    """Scraper for property listings on aarz.pk (Lahore search results).

    Builds a filtered search URL from the constructor arguments, fetches
    the page with ``requests``, and parses the listing cards with
    BeautifulSoup (lxml parser).
    """

    # Class-level defaults; shadowed by the instance attributes assigned in
    # __init__. Kept for backward compatibility with any code that reads
    # them off the class itself.
    location = ""
    price_min = 0
    price_max = 0
    purpose = 0
    page = 1

    # Pattern extracting a plot area such as "5 Marla" or "1 Kanal" from a
    # listing title. Compiled once here instead of once per listing
    # (originally re-compiled inside the parsing loop).
    _AREA_RE = re.compile(r"\d*\s(Kanal|Marla)", re.I)

    def __init__(self, location='samanabad',
                 price_min=0,
                 price_max=0,
                 purpose=0,
                 page=1):
        """Constructor.

        Keyword arguments:
        location -- Area whose listings will be parsed. Use the location
                    name (the 'key' in the dictionary generated by
                    'pyAraz.Locations'), not a link.
        price_min -- Lower price limit; ignored when <= 0.
        price_max -- Upper price limit; ignored when <= 0.
        purpose -- Whether to parse '0: Buy/Sell' or '1: Rentals'.
        page -- Which result page to parse for the area.
        """
        self.location = location
        self.price_min = price_min
        self.price_max = price_max
        self.purpose = purpose  # Buy is 0, rent is 1.
        self.page = page

    def getLink(self):
        """Return the aarz.pk search URL for the configured filters.

        Price parameters are appended only when positive; purpose is
        rendered as the literal string "Sell" (0) or "Rent" (any other
        value), matching the site's query-string format.
        """
        link = ("https://www.aarz.pk/search/page/" + str(self.page)
                + "?city_s=Lahore")
        link += "&loc1=" + self.location
        if self.price_min > 0:
            link += "&price_min=" + str(self.price_min)
        if self.price_max > 0:
            link += "&price_max=" + str(self.price_max)
        link += "&purpose=" + ("Sell" if self.purpose == 0 else "Rent")
        return link

    def getHtmlDoc(self):
        """Fetch the search page and return its raw HTML text."""
        return requests.get(self.getLink()).text

    def getSoup(self):
        """Return a BeautifulSoup tree of the search page (lxml parser)."""
        return BeautifulSoup(self.getHtmlDoc(), 'lxml')

    def getListings(self):
        """Parse the configured search page and return its listings.

        Returns:
            On success, a dict keyed by a running item index (seeded from
            the page's "show_items" counter), each value a dict with keys:
                link -- Absolute URL of the listing on aarz.pk.
                title -- Title of the listing.
                price -- Price/rent string as shown on the page.
                area -- "<n> Kanal/Marla" extracted from the title,
                        or None when the title carries no area.
                address -- Address line of the property.
                description -- Description text, or None when the
                               listing has no description block.
                addedOn -- First three whitespace-separated tokens of
                           the listing's side-info (the posting date).
            When the page has no results container, the tuple
            (self.page, 0, None) — kept for backward compatibility with
            existing callers that unpack the failure case.
        """
        listings = {}
        soup = self.getSoup()
        try:
            # The results header reads like "Showing X - Y of Z items";
            # token 3 is the index of the first item on this page.
            item_count = int(
                soup.find('div', attrs={'class': 'show_items'})
                    .p.text.split(" ")[3])
        except AttributeError:
            # No results container on the page: nothing to parse.
            return self.page, 0, None
        for card in soup.findAll('div', attrs={'class': 'property-listing row'}):
            single = {}
            single['link'] = "https://www.aarz.pk" + card.h2.a['href']
            single['title'] = card.h2.a.text
            single['price'] = card.h2.h4.text
            match = self._AREA_RE.search(single['title'])
            single['area'] = match.group(0) if match is not None else None
            info_div = card.find(
                'div', attrs={'class': 'col-md-5 col-sm-4 col-xs-12'})
            single['address'] = info_div.h4.a.text
            # NOTE(review): beds/baths parsing from the 'property-features'
            # div was commented out upstream and never populated; dropped
            # here rather than re-enabled.
            try:
                single['description'] = info_div.find(
                    'div', attrs={'class': 'text-muted property-desc'}
                ).p.text.strip()
            except AttributeError:
                # Listing has no description block.
                single['description'] = None
            added_info = info_div.find(
                'div', attrs={'class': 'text-muted property-side-info'}
            ).text.strip().split(" ")
            single['addedOn'] = " ".join(added_info[:3])
            listings[item_count] = single
            item_count += 1
        return listings