# Scraping.py

# -*- coding: utf-8 -*-
"""doctor.ipynb

Automatically generated by Colaboratory.

Original file is located at
    https://colab.research.google.com/drive/137GWgclDBfLC8NuPI2qlDOAJ6KpXTCgI
"""

import math

import numpy as np
import requests
from bs4 import BeautifulSoup

# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/42.0.2311.135 Safari/537.36 Edge/12.246'
    ),
}

# Cities whose doctor listings are scraped.
city = ['Delhi', 'Mumbai', 'Bangalore']

# One independent, initially empty accumulator per output column; the
# generator yields a fresh list for every name, so none of them are shared.
(
    names,
    degree_types,
    dp_scores,
    npv_values,
    locations,
    cities,
    consult_fees,
    years_of_experience,
    specialities,
) = ([] for _ in range(9))


# Listing page size implied by the page-count arithmetic (results / 10).
_RESULTS_PER_PAGE = 10
# Seconds to wait on any single HTTP request, so that one stalled connection
# cannot hang the whole scrape.
_REQUEST_TIMEOUT = 30

# One entry per output column:
#   (accumulator list, label used in error messages, extractor).
# Each extractor receives the parsed profile page and returns the raw text for
# that column; it raises (typically AttributeError when the element is missing,
# or IndexError when the text has fewer parts than expected) if the value
# cannot be read.
_PROFILE_FIELDS = (
    (names, 'names',
     lambda page: page.find('h1', {'class': 'c-profile__title u-bold u-d-inlineblock'}).text),
    (degree_types, 'degree_types',
     lambda page: page.find('p', {'class': 'c-profile__details'}).text),
    (dp_scores, 'dp_scores',
     lambda page: page.find('span', {'class': 'u-green-text u-bold u-large-font'}).text),
    (npv_values, 'npv_values',
     lambda page: page.find('span', {'class': 'u-smallest-font u-grey_3-text'}).text),
    # Location and city are the first and second comma-separated parts of the
    # same clinic-location heading.
    (locations, 'locations',
     lambda page: page.find('h4', {'class': 'c-profile--clinic__location'}).text.split(',')[0]),
    (cities, 'cities',
     lambda page: page.find('h4', {'class': 'c-profile--clinic__location'}).text.split(',')[1]),
    (consult_fees, 'consult_fees',
     lambda page: page.find('span', {'data-qa-id': 'consultation_fee'}).text),
    # Keeps the last two characters before the first non-breaking space.
    (years_of_experience, 'years_of_experience',
     lambda page: page.find('div', {'class': 'c-profile__details'}).text.split('\xa0')[0][-2:]),
    (specialities, 'specialities',
     lambda page: page.find('div', {'class': 'u-d-inline-flex flex-ai-center'}).text),
)


def _count_listing_pages(listing_soup):
    """Return the number of listing pages needed for the advertised doctor count.

    The count is read from the first whitespace-separated token of the page
    heading. Thousands separators are stripped before conversion, and the
    result is rounded *up* so a final, partially filled page is not dropped.

    Raises:
        AttributeError: the heading element is not present.
        IndexError, ValueError: the heading does not start with a number.
    """
    heading = listing_soup.find('h1', {'class': 'u-xx-large-font u-bold'})
    total_results = int(heading.text.split()[0].replace(',', ''))
    return math.ceil(total_results / _RESULTS_PER_PAGE)


def _collect_doctor_urls(session, city_name, total_pages):
    """Return the unique doctor-profile URLs found on a city's listing pages.

    URLs are de-duplicated while keeping first-seen order, so repeated runs
    over the same pages yield rows in the same order. A listing page that
    cannot be downloaded is reported and skipped instead of aborting the run.
    """
    found = {}
    # NOTE(review): assumes the site numbers listing pages from 1 -- confirm.
    for page in range(1, total_pages + 1):
        page_url = f'https://www.practo.com/{city_name}/doctors?page={page}'
        try:
            html = session.get(page_url, headers=headers,
                               timeout=_REQUEST_TIMEOUT).content
        except requests.RequestException as e:
            print(f'error in listing page {e}:{page_url}')
            continue
        for link in BeautifulSoup(html, 'html5lib').find_all('a', href=True):
            href = link['href']
            if 'doctor' in href and 'practice_id' in href and 'recommended' not in href:
                found['https://www.practo.com' + href] = None
    return list(found)


def _extract(getter, profile_soup, label, profile_url):
    """Return ``getter(profile_soup)``, or NaN (after logging) if it fails.

    The broad ``except`` is deliberate: scraping is best-effort, and one
    missing element must not abort the run.
    """
    try:
        return getter(profile_soup)
    except Exception as e:
        print(f'error in {label} {e}:{profile_url}')
        return np.nan


for city_name in city:
    # The context manager closes the session's pooled connections once the
    # city is finished, including when the city is skipped early.
    with requests.Session() as session:
        url = f'https://www.practo.com/{city_name}/doctors?page='
        try:
            response = session.get(url, headers=headers, timeout=_REQUEST_TIMEOUT)
            soup = BeautifulSoup(response.content, 'html5lib')
            total_pages = _count_listing_pages(soup)
        except Exception as e:
            # Without a page count there is nothing to iterate for this city;
            # report it and carry on with the remaining cities.
            print(f'error in listing {e}:{url}')
            continue

        unique_doctors_urls = _collect_doctor_urls(session, city_name, total_pages)

        for url in unique_doctors_urls:
            try:
                soup1 = BeautifulSoup(
                    session.get(url, headers=headers,
                                timeout=_REQUEST_TIMEOUT).content,
                    'html5lib')
            except Exception as e:
                print(f'error in soup1 {e}:{url}')
                # Record an all-NaN row so every profile URL still yields one
                # row and the failure shows up in the null counts, rather than
                # re-reading the previous doctor's page.
                for column, _label, _getter in _PROFILE_FIELDS:
                    column.append(np.nan)
                continue

            # Exactly one value (or NaN) is appended to every column, which
            # keeps all accumulator lists the same length.
            for column, label, getter in _PROFILE_FIELDS:
                column.append(_extract(getter, soup1, label, url))

import pandas as pd

# Column name -> scraped values, in the order the columns appear in the CSV.
data = {
    'Name': names,
    'Degree': degree_types,
    'DP Score': dp_scores,
    'NPV Value': npv_values,
    'Location': locations,
    'City': cities,
    'Consult Fee': consult_fees,
    'Years of Experience': years_of_experience,
    'Speciality': specialities
}

# Print each column's length (in column order) as a sanity check: the
# DataFrame constructor below requires all columns to be equally long.
for column_values in data.values():
    print(len(column_values))

df = pd.DataFrame(data)

# Write the scraped table to disk, without the index column.
df.to_csv('doctor.csv', index=False)

# Report how many values are missing per column. A bare expression is only
# displayed inside a notebook, so print it to make it visible in a script too.
print(df.isnull().sum())