# scrape_course.py — scrape course pages listed in all_my_links_file.txt,
# filter them by a minimum rating, and dump the results to CSV files.
# Try the imports; on failure, install the missing third-party packages and retry once.
import os  # stdlib -- imported up front so the install fallback below can use it

for import_attempt in range(2):
    try:
        from bs4 import BeautifulSoup
        import requests
        import pandas as pd
        import re
        break  # everything imported cleanly; no second pass needed
    except ImportError:
        # Install all third-party dependencies in one call. lxml is included
        # because BeautifulSoup is invoked with the "lxml" parser below.
        os.system('pip install beautifulsoup4 requests pandas lxml')
def fetch_text_between_braces(my_string):
    """Return the text between the first '(' and the first following ')'.

    Returns '' when either delimiter is missing.  (The original slicing used
    str.find results unchecked, so a missing delimiter's -1 produced garbage,
    and ')' was searched from position 0 rather than after the '('.)
    """
    start = my_string.find('(')
    if start == -1:
        return ''
    end = my_string.find(')', start + 1)
    if end == -1:
        return ''
    return my_string[start + 1:end]
class scrape_course:
    """Scrape course pages listed in all_my_links_file.txt.

    Lines containing 'draft' are treated as deleted courses (format
    'draft->title->link'); every other line is fetched over HTTP and
    scraped for title / rating / enrollment / length data.
    """

    def __init__(self):
        # Per-instance state.  (The original kept these as CLASS attributes,
        # so every instance shared -- and polluted -- the same lists.)
        # Deleted course info.
        self.deleted_course_titles = []
        self.deleted_course_links = []
        self.deleted_course_dictionary = {}
        # Existing course info.
        self.existing_course_titles = []
        self.existing_course_links = []
        self.existing_course_ratings = []
        self.existing_course_total_ratings = []
        self.existing_course_total_students = []
        self.existing_course_length = []
        self.existing_course_dictionary = {}

    def fetch_link_info(self, filter_rating):
        """Read all_my_links_file.txt and scrape each non-draft course page.

        filter_rating -- minimum rating a course must have to be kept.
        Returns (deleted_course_dictionary, existing_course_dictionary).
        """
        count = 0  # diagnostic progress counter
        with open('all_my_links_file.txt', mode='r', encoding='utf-8') as links_file:
            for single_link in links_file:
                if 'draft' in single_link:
                    # Deleted/draft course: line format is 'draft->title->link'.
                    parts = single_link.split('->')
                    self.deleted_course_titles.append(parts[1])
                    self.deleted_course_links.append(parts[2])
                    count += 1
                    print(count, "Inside Deleted Course.")
                    continue
                try:
                    my_page = requests.get(single_link)
                    soup = BeautifulSoup(my_page.content, "lxml")
                    # Fetch the course rating.
                    course_rating = float(soup.find(attrs={"data-purpose": "rating-number"}).string)
                    count += 1
                    print(count, course_rating, "Inside Existing Course.")
                    # BUGFIX: the original comparison was inverted
                    # (filter_rating >= course_rating kept LOW-rated courses,
                    # contradicting the "minimum rating" prompt in __main__).
                    if course_rating < filter_rating:
                        continue
                    course_title = soup.h1.string  # course title
                    total_ratings = soup.find(attrs={"data-purpose": "rating"}).text
                    total_students = soup.find(attrs={"data-purpose": "enrollment"}).text
                    try:
                        total_time = soup.find(attrs={"data-purpose": "video-content-length"}).text
                    except AttributeError:
                        # Free course pages lay out the length differently.
                        total_time = ''
                        print('Free Course Page, scrape total time here.')
                except Exception:
                    # BUGFIX: the original RETURNED from inside the loop here,
                    # aborting the entire scrape on the first bad link.
                    # Log and move on to the next link instead.
                    print("Error in Link: ", single_link)
                    continue
                # Append the scraped data only after a fully successful scrape.
                self.existing_course_titles.append(course_title)
                self.existing_course_ratings.append(course_rating)
                self.existing_course_total_ratings.append(total_ratings)
                self.existing_course_total_students.append(total_students)
                self.existing_course_length.append(total_time)
                self.existing_course_links.append(single_link)
        # Build the result dictionaries once, after the loop (the original
        # rebuilt them redundantly on every iteration inside a finally block).
        self.deleted_course_dictionary = {'Title': self.deleted_course_titles,
                                          'Link': self.deleted_course_links}
        self.existing_course_dictionary = {'Title': self.existing_course_titles,
                                           'Ratings': self.existing_course_ratings,
                                           'Total Ratings': self.existing_course_total_ratings,
                                           'Total Students': self.existing_course_total_students,
                                           'Total Length': self.existing_course_length,
                                           'Link': self.existing_course_links}
        print(self.deleted_course_dictionary)
        print(self.existing_course_dictionary)
        return (self.deleted_course_dictionary, self.existing_course_dictionary)
class generate_csv():
    """Write the deleted-course and filtered-course dictionaries to CSV files."""

    def __init__(self, my_deleted_dictionary, my_filtered_courses_dictionary):
        # Pair each dictionary with its destination file, then write both.
        outputs = ((my_deleted_dictionary, 'deleted_courses.csv'),
                   (my_filtered_courses_dictionary, 'my_filtered_courses.csv'))
        for dictionary, csv_path in outputs:
            pd.DataFrame(dictionary).to_csv(csv_path)
if __name__ == '__main__':
    # BUGFIX: the original rebound the class names themselves
    # (scrape_course = scrape_course(), generate_csv = generate_csv(...)),
    # shadowing the classes; use distinct instance names instead.
    scraper = scrape_course()
    filter_rating = float(input('Enter the minimum rating you want the course to be: '))
    print(f'Entered rating is: {filter_rating}')
    deleted_course_dictionary, existing_course_dictionary = scraper.fetch_link_info(filter_rating)
    # Dump the scraped data to CSV files.
    generate_csv(deleted_course_dictionary, existing_course_dictionary)