scraping.py
# -*- coding: utf-8 -*-
"""
Created on Sat Jun 3 11:54:07 2023
@author: spika
"""
from selenium import webdriver
from selenium.webdriver.firefox.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException, ElementNotInteractableException
from webdriver_manager.firefox import GeckoDriverManager
from bs4 import BeautifulSoup
import time
import re
import pickle
import pandas as pd
def initialize_browser(n):
    """Open the book's landing page and expand the accordion panel with id 'radix-<n>'."""
    driver = webdriver.Firefox(service=Service(GeckoDriverManager().install()))
    driver.get("https://flexbooks.ck12.org/user:zxbpc2rzcziwmthaz21hawwuy29t/cbook/world-history-studies_episd/")
    print("starting driver")
    wait = WebDriverWait(driver, 50)
    while True:
        # Scroll down so lazily rendered accordion buttons become clickable.
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        try:
            click_button = wait.until(EC.element_to_be_clickable((By.ID, 'radix-' + str(n))))
            click_button.click()
            break
        except (TimeoutException, ElementNotInteractableException):
            # Keep scrolling and retrying until the panel button can be clicked.
            continue
    return driver
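# Usage sketch: the chapter panels appear to use sequential element ids of the
# form 'radix-<n>' (these look like Radix UI accordion ids, odd-numbered per
# chapter), so initialize_browser(1) expands the first chapter. The loop at the
# bottom of this script walks n = 1, 3, ..., 19.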
def get_links(parent):
    """Collect {link text: href} for every anchor inside a chapter container."""
    time.sleep(1)
    chapter_links = {}
    if parent:
        for anchor in parent.find_all('a'):
            chapter_links[anchor.text] = anchor['href']
    else:
        print("No parent element found.")
    return chapter_links
#________________Part2________________
def doc_link(new):
    """Build the absolute lesson URL from a cleaned relative href."""
    url = "https://flexbooks.ck12.org/user:zxbpc2rzcziwmthaz21hawwuy29t/cbook/world-history-studies_episd" + new
    return url
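# Example with a hypothetical cleaned href:
# doc_link("/section/1.1/") returns
# "https://flexbooks.ck12.org/user:zxbpc2rzcziwmthaz21hawwuy29t/cbook/world-history-studies_episd/section/1.1/"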
def extract_info(chapters):
    """Visit every lesson page and collect its concept text plus any table cells."""
    data = {}
    for chapter_links in chapters:
        for key, href in chapter_links.items():
            # A fresh browser per lesson keeps one hung page from stalling the run.
            with webdriver.Firefox(service=Service(GeckoDriverManager().install())) as driver:
                driver.set_page_load_timeout(60)  # give each page up to 60 seconds to load
                link = doc_link(href)
                try:
                    driver.get(link)
                except TimeoutException:
                    print("Loading took too much time!")
                time.sleep(7)
                soup = BeautifulSoup(driver.page_source, 'html.parser')

                body_text = []
                div_element = soup.find('div', class_='x-ck12-data-concept')
                if div_element is not None:
                    body_text.append(div_element.get_text(strip=True))

                # find_all() returns a (possibly empty) list, never None, so iterate directly.
                for table in soup.find_all('table'):
                    for row in table.find_all('tr'):
                        for cell in row.find_all('td'):
                            body_text.append(cell.get_text())

                body_text = " ".join(body_text)
                print(body_text)

                # Lesson names look like "1.2\xa0\xa0Title"; split the number from the title.
                match = re.match(r'(\d+\.\d+)\xa0\xa0(.*)', key)
                if match:
                    number, title = match.groups()
                    print(number, "loading")
                    data[number] = {"title": title, "body_text": body_text}
    return data
# data maps lesson number to its contents: {"<number>": {"title": <str>, "body_text": <str>}}
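# Sketch of the title split above, using a hypothetical lesson name:
# re.match(r'(\d+\.\d+)\xa0\xa0(.*)', "3.1\xa0\xa0The Renaissance")
# yields number "3.1" and title "The Renaissance".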
def replace_pattern_in_urls(chapter_links_list, pattern):
    updated_links_list = []
    for chapter_links in chapter_links_list:
        updated_links = {}
        for chapter_name, url in chapter_links.items():
            # Substitute the pattern with an empty string
            url_clean = re.sub(pattern, '', url)
            updated_links[chapter_name] = url_clean
        updated_links_list.append(updated_links)
    return updated_links_list
# Get the chapter link lists from the first page.
chapters = []
for num, i in enumerate(range(1, 20, 2)):
    driver = initialize_browser(i)
    soup = BeautifulSoup(driver.page_source, 'html.parser')
    parent = soup.find('div', {'id': f"course_item_child_{num + 1}"})
    chapters.append(get_links(parent))
    driver.quit()  # close the browser once the expanded panel has been parsed
# First run: persist the scraped chapter links so later runs can skip the browser.
# with open('chapters.pickle', 'wb') as handle:
#     pickle.dump(chapters, handle, protocol=pickle.HIGHEST_PROTOCOL)
with open('chapters.pickle', 'rb') as handle:
    chapters = pickle.load(handle)
# Clean the data: strip the shared book prefix so doc_link() can rebuild full URLs.
pattern = '/user:zxbpc2rzcziwmthaz21hawwuy29t/cbook/world-history-studies_episd'
cleaned_chapters = replace_pattern_in_urls(chapters, pattern)

# Iterate over every page and gather the data (slow; uncomment to re-scrape).
# data = extract_info(cleaned_chapters)

# Load the previously scraped pickle.
with open('data1.pickle', 'rb') as handle:
    data = pickle.load(handle)

# Save pickle (run once after a fresh scrape):
# with open('data1.pickle', 'wb') as handle:
#     pickle.dump(data, handle, protocol=pickle.HIGHEST_PROTOCOL)
# Convert data into a list of dictionaries, adding 'number' as a key.
data_list = [{"number": number, **info} for number, info in data.items()]
# Convert the list into a DataFrame and write it to a CSV file.
df = pd.DataFrame(data_list)
df.to_csv("output.csv", index=False)
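# The resulting output.csv should contain one row per lesson with the columns
# number, title, body_text (assuming data kept the shape noted above).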