get_books.py (forked from maria-antoniak/goodreads-scraper)
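
"""Scrape book metadata from Goodreads.

Given a file of Goodreads book IDs (one per line), this script fetches each
book page, extracts metadata (title, author, details, genres, page count,
publication and format info, ratings and reviews counts, rating distribution,
and cover image URI), writes one JSON file per book, and finally condenses
all books into a single all_books.json (or all_books.csv) file.
"""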
import argparse
from datetime import datetime
import json
import os
import re
import time
from urllib.request import urlopen
from urllib.error import HTTPError
import bs4
import pandas as pd
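
# Third-party dependencies: beautifulsoup4 (imported as bs4) and pandas.
# lxml would additionally be needed only if the commented-out helpers below
# were revived, since they parse with the 'lxml' parser.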

# Doesn't work.
# def get_all_lists(soup):
#     lists = []
#     list_count_dict = {}
#     if soup.find('a', text='More lists with this book...'):
#         lists_url = soup.find('a', text='More lists with this book...')['href']
#         source = urlopen('https://www.goodreads.com' + lists_url)
#         soup = bs4.BeautifulSoup(source, 'lxml')
#         lists += [' '.join(node.text.strip().split()) for node in soup.find_all('div', {'class': 'cell'})]
#         i = 0
#         while soup.find('a', {'class': 'next_page'}) and i <= 10:
#             time.sleep(2)
#             next_url = 'https://www.goodreads.com' + soup.find('a', {'class': 'next_page'})['href']
#             source = urlopen(next_url)
#             soup = bs4.BeautifulSoup(source, 'lxml')
#             lists += [node.text for node in soup.find_all('div', {'class': 'cell'})]
#             i += 1
#         # Format lists text.
#         for _list in lists:
#             # _list_name = ' '.join(_list.split()[:-8])
#             # _list_rank = int(_list.split()[-8][:-2])
#             # _num_books_on_list = int(_list.split()[-5].replace(',', ''))
#             # list_count_dict[_list_name] = _list_rank / float(_num_books_on_list)  # TODO: switch this back to raw counts
#             _list_name = _list.split()[:-2][0]
#             _list_count = int(_list.split()[-2].replace(',', ''))
#             list_count_dict[_list_name] = _list_count
#     return list_count_dict

# Doesn't work.
# def get_shelves(soup):
#     shelf_count_dict = {}
#     if soup.find('a', text='See top shelves…'):
#         # Find shelves text.
#         shelves_url = soup.find('a', text='See top shelves…')['href']
#         source = urlopen('https://www.goodreads.com' + shelves_url)
#         soup = bs4.BeautifulSoup(source, 'lxml')
#         shelves = [' '.join(node.text.strip().split()) for node in soup.find_all('div', {'class': 'shelfStat'})]
#         # Format shelves text.
#         shelf_count_dict = {}
#         for _shelf in shelves:
#             _shelf_name = _shelf.split()[:-2][0]
#             _shelf_count = int(_shelf.split()[-2].replace(',', ''))
#             shelf_count_dict[_shelf_name] = _shelf_count
#     return shelf_count_dict

def get_genres(soup):
    genres_div = soup.find("div", {"data-testid": "genresList"})
    if genres_div:
        genre_links = genres_div.find_all("a", class_="Button--tag-inline")
        return [link.text.strip() for link in genre_links]
    return []

def get_series_name(soup):
    series_div = soup.find(id="bookSeries")
    series = series_div.find("a") if series_div else None
    if series:
        match = re.search(r'\((.*?)\)', series.text)
        if match:
            return match.group(1)
    return ""

def get_series_uri(soup):
    series_div = soup.find(id="bookSeries")
    series = series_div.find("a") if series_div else None
    if series:
        return series.get("href")
    return ""

# def get_top_5_other_editions(soup):
#     other_editions = []
#     for div in soup.findAll('div', {'class': 'otherEdition'}):
#         other_editions.append(div.find('a')['href'])
#     return other_editions

def get_publication_info(soup):
    publication_info_list = []
    featured_details = soup.find_all('div', class_='FeaturedDetails')
    for item in featured_details:
        publication_info = item.find('p', {'data-testid': 'publicationInfo'})
        if publication_info:
            publication_info_list.append(publication_info.text)
    return publication_info_list

def get_num_pages(soup):
    number_of_pages_list = []
    featured_details = soup.find_all('div', class_='FeaturedDetails')
    for item in featured_details:
        format_info = item.find('p', {'data-testid': 'pagesFormat'})
        if format_info:
            format_text = format_info.text
            parts = format_text.split(', ')
            if len(parts) == 2:
                number_of_pages, _ = parts
                # Extract the number from the string
                number_of_pages = ''.join(filter(str.isdigit, number_of_pages))
                number_of_pages_list.append(number_of_pages)
            elif len(parts) == 1:
                # Check if it's a number
                if parts[0].isdigit():
                    number_of_pages_list.append(parts[0])
                else:
                    number_of_pages_list.append(None)
            else:
                number_of_pages_list.append(None)
        else:
            number_of_pages_list.append(None)
    return number_of_pages_list

def get_format_info(soup):
    format_info_list = []
    featured_details = soup.find_all('div', class_='FeaturedDetails')
    for item in featured_details:
        format_info = item.find('p', {'data-testid': 'pagesFormat'})
        if format_info:
            format_info_list.append(format_info.text)
    return format_info_list

def get_rating_distribution(soup):
    rating_numbers = {}
    try:
        rating_bars = soup.find_all('div', class_='RatingsHistogram__bar')
        # Iterate through each rating bar (one per star level)
        for rating_bar in rating_bars:
            rating = None
            try:
                # Extract the rating (number of stars) from the aria-label attribute
                rating = rating_bar['aria-label'].split()[0]
                # Extract the number of ratings from the text of the labelTotal div
                label_total = rating_bar.find('div', class_='RatingsHistogram__labelTotal')
                num_ratings = label_total.get_text().split(' ')[0]
                # Store the count (as the raw string from the page) in the dictionary
                rating_numbers[rating] = num_ratings
            except (KeyError, IndexError, AttributeError) as e:
                print(f"Error occurred while extracting rating count for {rating}: {e}")
                # Fall back to '0' if the count could not be found
                if rating is not None:
                    rating_numbers[rating] = '0'
    except AttributeError as e:
        print(f"Error occurred while finding rating bars: {e}")
    return rating_numbers

def get_cover_image_uri(soup):
    cover_image = soup.find('img', class_='ResponsiveImage')
    if cover_image:
        return cover_image.get('src')
    return ""

def book_details(soup):
    try:
        return soup.find('div', class_='DetailsLayoutRightParagraph').text
    except AttributeError:
        return ' '

def contributor_info(soup):
    contributor = soup.find('a', {'class': 'ContributorLink'})
    return contributor

def scrape_book(book_id):
    url = 'https://www.goodreads.com/book/show/' + book_id
    source = urlopen(url)
    soup = bs4.BeautifulSoup(source, 'html.parser')
    time.sleep(2)

    return {'book_id': book_id,  # done
            # 'book_id': get_id(book_id),  # redundant
            'cover_image_uri': get_cover_image_uri(soup),  # done
            'book_title': ' '.join(soup.find('h1', {'data-testid': 'bookTitle'}).text.split()),  # done
            'book_details': book_details(soup),  # added
            # 'book_series': get_series_name(soup),  # cannot find
            # 'book_series_uri': get_series_uri(soup),  # cannot find
            # 'top_5_other_editions': get_top_5_other_editions(soup),  # inside DOM element
            # 'isbn': get_isbn(soup),  # inside DOM element
            # 'isbn13': get_isbn13(soup),  # inside DOM element
            'format': get_format_info(soup),  # added
            'publication_info': get_publication_info(soup),  # added
            'authorlink': contributor_info(soup)['href'],  # done
            'author': contributor_info(soup).find('span', {'class': 'ContributorLink__name'}).text.strip(),  # done
            'num_pages': get_num_pages(soup),  # added/done
            'genres': get_genres(soup),  # done
            # 'shelves': get_shelves(soup),  # doesn't work
            # 'lists': get_all_lists(soup),  # doesn't work
            'num_ratings': ''.join(filter(str.isdigit, soup.find('span', {'data-testid': 'ratingsCount'}).text)),  # done
            'num_reviews': ''.join(filter(str.isdigit, soup.find('span', {'data-testid': 'reviewsCount'}).text)),  # done
            'average_rating': soup.find('div', {'class': 'RatingStatistics__rating'}).text.strip(),  # done
            'rating_distribution': get_rating_distribution(soup)}  # done

def condense_books(books_directory_path):
    books = []
    # Load every per-book "book-metadata" JSON file in the directory so they can be condensed into a single file.
    for file_name in os.listdir(books_directory_path):
        if file_name.endswith('.json') and not file_name.startswith('.') and file_name != "all_books.json" and "book-metadata" in file_name:
            with open(os.path.join(books_directory_path, file_name), 'r') as f:
                books.append(json.load(f))
    return books

def main():
    start_time = datetime.now()
    script_name = os.path.basename(__file__)

    parser = argparse.ArgumentParser()
    parser.add_argument('--book_ids_path', type=str)
    parser.add_argument('--output_directory_path', type=str)
    parser.add_argument('--format', type=str, action="store", default="json",
                        dest="format", choices=["json", "csv"],
                        help="set file output format")
    args = parser.parse_args()

    book_ids = [line.strip() for line in open(args.book_ids_path, 'r') if line.strip()]
    books_already_scraped = [file_name.replace('_book-metadata.json', '') for file_name in os.listdir(args.output_directory_path) if file_name.endswith('.json') and not file_name.startswith('all_books')]
    books_to_scrape = [book_id for book_id in book_ids if book_id not in books_already_scraped]
    condensed_books_path = args.output_directory_path + '/all_books'

    for i, book_id in enumerate(books_to_scrape):
        try:
            print(str(datetime.now()) + ' ' + script_name + ': Scraping ' + book_id + '...')
            print(str(datetime.now()) + ' ' + script_name + ': #' + str(i + 1 + len(books_already_scraped)) + ' out of ' + str(len(book_ids)) + ' books')
            book = scrape_book(book_id)
            # Include the book ID in the file name so each metadata file is unambiguous.
            json.dump(book, open(args.output_directory_path + '/' + book_id + '_book-metadata.json', 'w'), indent=4)  # indent makes the output easier to read
            print('=============================')
        # An AttributeError means the page has no h1 book title (currently the case for
        # Goodreads pages that don't exist), so skip this book and move on to the next one.
        except AttributeError as e:
            print(e)
            continue
        except HTTPError as e:
            print(e)
            exit(0)

    books = condense_books(args.output_directory_path)
    if args.format == 'json':
        json.dump(books, open(f"{condensed_books_path}.json", 'w'), indent=4)
    elif args.format == 'csv':
        json.dump(books, open(f"{condensed_books_path}.json", 'w'))
        book_df = pd.read_json(f"{condensed_books_path}.json")
        book_df.to_csv(f"{condensed_books_path}.csv", index=False, encoding='utf-8')

    print(str(datetime.now()) + ' ' + script_name + f':\n\n🎉 Success! All book metadata scraped. 🎉\n\nMetadata files have been output to /{args.output_directory_path}\nGoodreads scraping run time = ⏰ ' + str(datetime.now() - start_time) + ' ⏰')


if __name__ == '__main__':
    main()
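
# Example usage (the file and directory names here are only illustrative; the
# book IDs file lists one Goodreads book ID per line, and the output directory
# must already exist):
#
#     python get_books.py --book_ids_path book_ids.txt --output_directory_path books --format csv
#
# This writes one <book_id>_book-metadata.json file per book into the output
# directory, then condenses them into all_books.json (and all_books.csv when
# --format csv is used).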