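"""extract_data.py

Scrape passenger reviews for a single airline from airlinequality.com and
save them to a CSV file. Each row combines the review text with the review
details table and the per-service star ratings (see the example invocation
at the bottom of this file).
"""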
import argparse
import re
import time

import pandas as pd
import requests
from bs4 import BeautifulSoup

parser = argparse.ArgumentParser()
parser.add_argument('--input_airline', required=True,
                    help='name of the airline to scrape')
parser.add_argument('--output_data', required=True,
                    help='name of the scraped CSV file')
parser.add_argument('--input_page_size', required=False, type=int, default=20,
                    help='number of reviews to extract from a single page')
parser.add_argument('--input_sleep_time', required=True, type=int,
                    help='time (seconds) to wait before scraping the next page')
args = parser.parse_args()
input_airline = args.input_airline
output_data = args.output_data
input_page_size = args.input_page_size
input_sleep_time = args.input_sleep_time

data_list = []
user_agent = "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36"
base_url = "https://www.airlinequality.com/airline-reviews"
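
# Review pages follow the site's paginated URL pattern:
#   {base_url}/{airline}/page/{n}/?sortby=post_date%3ADesc&pagesize={size}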
def get_soup(page_count, page_size=20):
    """Fetch one review page and return its <article itemprop="review"> tags."""
    headers = {"User-Agent": user_agent}
    url = (f"{base_url}/{input_airline}/page/{page_count}/"
           f"?sortby=post_date%3ADesc&pagesize={page_size}")
    # pass headers as a keyword argument; a positional second argument
    # would be treated as query params and the User-Agent never sent
    response = requests.get(url, headers=headers, timeout=30)
    if response.status_code == 200:
        soup = BeautifulSoup(response.content, 'lxml')
        return soup.find_all("article", itemprop="review")
    print(f"Error in getting soup (status {response.status_code})")
    return []  # an empty result lets the caller stop cleanly


def extract_raw_data(cleaned_soup):
    """Extract each review's fields and append a flat record to data_list."""
    for article in cleaned_soup:
        ######## DATA OUTSIDE TABLE ########
        # review publication date
        try:
            date_published = article.find("time", itemprop="datePublished").get('datetime')
        except AttributeError:
            date_published = 'NA'
        # review title
        try:
            title = article.find("h2", class_="text_header").text
            title = re.sub(r'[“”"]', '', title)
        except AttributeError:
            title = 'NA'
        # is the trip verified or not?
        try:
            trip_verification = article.find("div", class_="text_content", itemprop="reviewBody")
            string_pattern = r'(Trip Verified|Not Verified|Verified Review)'
            trip_verification = re.search(string_pattern, trip_verification.text).group(1)
        except AttributeError:
            trip_verification = 'NA'
        # passenger's country
        try:
            country = article.find("h3", class_="text_sub_header userStatusWrapper")
            country = re.search(r'\((.*?)\)', country.text).group(1)
        except AttributeError:
            country = 'NA'
        # passenger's trip review
        try:
            review = article.find("div", class_="text_content", itemprop="reviewBody").text
            # drop the leading "Trip Verified |" style marker when present
            review_parts = review.split('|', 1)
            review = review_parts[1].strip() if len(review_parts) > 1 else review_parts[0].strip()
        except AttributeError:
            review = 'NA'
        # passenger's other reviews
        try:
            other_reviews = article.find("span", class_="userStatusReviewCount").text
            other_reviews = int(re.search(r'\d+', other_reviews).group(0))
        except AttributeError:
            other_reviews = 'NA'
        # passenger's rating out of 10
        try:
            rating_10 = article.find("span", itemprop="ratingValue").get_text(strip=True)
        except AttributeError:
            rating_10 = 'NA'
        general_dict = {
            'date_published': date_published,
            'summary_title': title,
            'country': country,
            'trip_verified': trip_verification,
            'review': review,
            'other_reviews': other_reviews,
            'ratings_10': rating_10,
        }
        ######## DATA INSIDE TABLE ########
        # other passenger details provided in the review table
        passenger_details = ["aircraft", "type_of_traveller", "cabin_flown", "route", "recommended"]
        passenger_details_dict = {}
        for detail in passenger_details:
            header_td = article.find("td", class_="review-rating-header " + detail)
            value_td = header_td.find_next('td', class_="review-value") if header_td else None
            passenger_details_dict[detail] = value_td.get_text(strip=True) if value_td else 'NA'
        # passenger ratings of individual services (number of filled stars)
        ratings_dict = {}
        services_ratings = ['seat_comfort', 'cabin_staff_service',
                            'food_and_beverages', 'inflight_entertainment',
                            'ground_service', 'wifi_and_connectivity', 'value_for_money']
        for rating in services_ratings:
            header_td = article.find('td', class_="review-rating-header " + rating)
            stars_td = header_td.find_next('td', class_='review-rating-stars') if header_td else None
            ratings_dict[rating] = len(stars_td.find_all('span', class_='star fill')) if stars_td else 'NA'
        # merge all extracted fields into one flat record (dict union needs Python 3.9+)
        data = general_dict | passenger_details_dict | ratings_dict
        data_list.append(data)
    return data_list
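

# Each record collected above is one flat dict, e.g. (illustrative values only):
#   {'date_published': '2024-01-15', 'summary_title': 'comfortable flight',
#    'aircraft': 'A350', 'seat_comfort': 4, ...}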
def scrape_data():
    """Scrape review pages until an empty page is returned, then write a CSV."""
    page_number = 1
    while True:
        print(f'Scraping from page {page_number}')
        raw_data = get_soup(page_count=page_number, page_size=input_page_size)
        if not raw_data:
            print("No more pages to scrape")
            break
        extract_raw_data(raw_data)
        time.sleep(input_sleep_time)
        page_number += 1
    df = pd.DataFrame(data_list)
    print(df.shape)
    df.to_csv(output_data, index=False)


if __name__ == "__main__":
    scrape_data()
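
# Example invocation (the airline slug and file name are illustrative; use the
# slug that appears in the airline's review URL on airlinequality.com):
#   python extract_data.py --input_airline qatar-airways \
#       --output_data qatar_airways_reviews.csv \
#       --input_page_size 100 --input_sleep_time 5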