forked from lasamson/devpost-scraper
-
Notifications
You must be signed in to change notification settings - Fork 0
/
dps.py
125 lines (106 loc) · 4.45 KB
/
dps.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
import csv
from urllib.request import urlopen
from bs4 import BeautifulSoup
import summarize
class DPS:
    """Scrape project submissions from hackathon gallery pages.

    For every base URL supplied at construction time, ``getData`` requests
    the first ``n_page`` ``project-gallery`` pages, opens up to ``n_subm``
    linked submissions per page and records, for each accepted submission,
    its title, subtitle, summarized description and "built with" tags.
    """

    def __init__(self, URLlst, n_subm_i, n_page_i, dontCheckWinner=True):
        """Store the scrape configuration.

        Args:
            URLlst: Base URLs of the hackathons to scrape.
            n_subm_i: Maximum number of submissions inspected per gallery page.
            n_page_i: Number of gallery pages requested per base URL.
            dontCheckWinner: When true, every inspected submission is
                recorded; when false, only submissions for which
                ``isWinner`` returns true are recorded.
        """
        self.data = {}
        self.URLlst = URLlst
        self.n_subm = n_subm_i
        self.n_page = n_page_i
        self.dontCheckWinner = dontCheckWinner

    def getData(self) -> dict:
        """Scrape every configured URL and return the accumulated results.

        Returns:
            ``self.data``: a dict mapping each base URL that was scraped
            without error to a list of
            ``[title, subtitle, description, builtWith]`` rows.

        Scraping is best-effort per base URL: if anything goes wrong while
        processing a URL, the failure is reported on stdout, that URL gets
        no entry in the result, and scraping continues with the next URL.
        """
        for baseUrl in self.URLlst:
            fieldsList = []
            page = 1
            try:
                while page <= self.n_page:
                    fieldsList.extend(self._scrapeGalleryPage(baseUrl, page))
                    page += 1
            except Exception as exc:
                # Deliberately not a bare ``except``: KeyboardInterrupt and
                # SystemExit must still be able to stop a long scrape.
                print(f"> Failed to scrape {baseUrl}: {exc!r}")
                continue
            self.data[baseUrl] = fieldsList
        return self.data

    @staticmethod
    def _galleryUrl(baseUrl, page):
        """Build the gallery URL for *page*, tolerating a trailing slash on *baseUrl*."""
        return baseUrl.rstrip("/") + "/project-gallery?page=" + str(page)

    @staticmethod
    def _fetchSoup(url):
        """Download *url* and parse it as HTML, closing the response afterwards."""
        with urlopen(url) as response:
            return BeautifulSoup(response, "html.parser")

    def _scrapeGalleryPage(self, baseUrl, page):
        """Return the rows collected from gallery page *page* of *baseUrl*."""
        galleryObj = self._fetchSoup(self._galleryUrl(baseUrl, page))
        submissions = galleryObj.findAll(
            "a", {"class": "block-wrapper-link fade link-to-software"}
        )
        rows = []
        for i, submission in enumerate(submissions):
            if i >= self.n_subm:
                break
            print("> Checking submission")
            subObj = self._fetchSoup(submission.attrs["href"])
            # isWinner is evaluated first so its log line is always printed,
            # even when winners are not being filtered.
            if self.isWinner(subObj) or self.dontCheckWinner:
                print("> Is a winner OR not checking for winners")
                rows.append(self._buildRow(subObj))
            else:
                print("> Not a winner and checking for winners")
        if submissions:
            print("FINISHED")
        return rows

    def _buildRow(self, subObj):
        """Extract one ``[title, subtitle, description, builtWith]`` row."""
        title = self.getTitle(subObj)
        subtitle = self.getSubtitle(subObj, title)
        description = self.getDescription(subObj)
        builtWith = self.getBuiltWith(subObj)
        print("> Adding to fieldsList")
        print(f"\t> {title}")
        return [
            title.get_text().strip(),
            subtitle.get_text().strip(),
            description,
            builtWith,
        ]

    def getTitle(self, subObj):
        """Return the ``<h1 id="app-title">`` tag of *subObj* and print its text.

        Raises:
            AttributeError: If the page has no such heading.
        """
        title = subObj.find("h1", {"id": "app-title"})
        print(f"> {title.get_text().strip()}")
        return title

    def getSubtitle(self, subObj, title):
        """Return the first ``<p>`` found under the parent of *title*.

        *subObj* is accepted for interface symmetry but is not used.
        """
        return title.parent.find("p")

    def isWinner(self, subObj) -> bool:
        """Return True if *subObj* contains a ``<span class="winner">``."""
        if subObj.find("span", {"class": "winner"}) is not None:
            print("\t> Submission is winner.")
            return True
        print("\t> Not winner.")
        return False

    def getDescription(self, subObj):
        """Return a summary of the submission's description text.

        The text of every ``<p>`` without ``id`` and ``class`` attributes
        inside ``<div id="app-details-left">`` is concatenated, newlines
        are replaced by ``"."``, and the result is handed to the
        project-local ``summarize`` module (the meaning of the arguments
        ``5`` and ``2`` is defined there).

        Raises:
            AttributeError: If the page has no ``app-details-left`` div.
        """
        details = subObj.find("div", {"id": "app-details-left"})
        paragraphs = details.find_all("p", {"id": False, "class": False})
        text = "".join(p.getText() for p in paragraphs).replace("\n", ".")
        return summarize.Summarize(5, text).summarize(2)

    def getImages(self, subObj) -> list:
        """Return the ``src`` of every image in the ``<div id="gallery">`` list.

        Gallery items without an ``<img>`` (or whose ``<img>`` has no
        ``src``) are reported and skipped; a missing gallery yields ``[]``.
        """
        imgList = []
        try:
            items = subObj.find("div", {"id": "gallery"}).findAll("li")
        except AttributeError:
            # find() returned None: the page has no gallery at all.
            print("\t> No Gallery Found")
            return imgList
        for item in items:
            try:
                imgList.append(item.find("img")["src"])
            except (TypeError, KeyError):
                # TypeError: no <img> in this item; KeyError: <img> lacks src.
                print("\t> Non-Image Link Found")
        return imgList

    def getBuiltWith(self, subObj) -> list:
        """Return the stripped text of every ``cp-tag`` span in ``<div id="built-with">``.

        A page without that div yields ``[]``.
        """
        try:
            tags = subObj.find("div", {"id": "built-with"}).findAll(
                "span", {"class": "cp-tag"}
            )
        except AttributeError:
            # find() returned None: the page has no "built with" section.
            print("\t> No Tools Found")
            return []
        return [tag.get_text().strip() for tag in tags]