-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscraper_utils.py
148 lines (125 loc) · 4.04 KB
/
scraper_utils.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
"""
This module collects utilities for extracting specific sub-objects from parsed pages.
"""
from urllib.request import urlopen
from bs4 import BeautifulSoup
def getTitle(subObj):
    """Return the ``<h1 id="app-title">`` element looked up on *subObj*.

    The element itself is returned, not its text. The value is whatever
    ``subObj.find`` yields for that query.
    """
    return subObj.find("h1", {"id": "app-title"})
def getSubtitle(subObj, title):
    """Return the first ``<p>`` found under the parent of *title*.

    Args:
        subObj: Unused; kept so existing callers keep working.
        title: Title element, e.g. the value returned by ``getTitle``.
            May be ``None`` when no title was found.

    Returns:
        The result of ``title.parent.find("p")``, or ``None`` when *title*
        is ``None`` or has no parent (instead of raising
        ``AttributeError``).
    """
    # A missing title must not crash subtitle extraction.
    if title is None or title.parent is None:
        return None
    return title.parent.find("p")
def getContent(subObj):
    """Return the stripped description text of *subObj*, or ``""``.

    When the page has no ``<div id="gallery">``, the text comes from the
    first ``<div>`` inside ``<div id="app-details-left">``. Otherwise it
    comes from the ``<div>`` located with ``findNext`` from the gallery's
    parent.

    Returns:
        The stripped text, or an empty string when any expected element is
        missing (a lookup returned ``None``).
    """
    try:
        # Look the gallery up once and reuse the result in both branches.
        gallery = subObj.find("div", {"id": "gallery"})
        if gallery is None:
            container = subObj.find("div", {"id": "app-details-left"}).find("div")
        else:
            container = gallery.parent.findNext("div")
        return container.get_text().strip()
    except AttributeError:
        # A missing element shows up as an attribute access on None; only
        # that case is treated as "no content".
        return ""
def getLikes(subObj):
    """Return the like count shown in the ``software-likes`` block.

    Returns:
        The stripped text of the ``<span class="side-count">`` inside
        ``<div class="software-likes">`` (a string), or the integer ``0``
        when either element is missing. The mixed return type is kept for
        backward compatibility; callers that need a number should convert
        the result themselves.
    """
    likes_box = subObj.find("div", {"class": "software-likes"})
    try:
        return likes_box.find("span", {"class": "side-count"}).get_text().strip()
    except AttributeError:
        # Either lookup returned None: no like counter on this page.
        return 0
def getUpdates(subObj):
    """Collect the updates listed in the ``software-updates`` block.

    Each ``<article>`` inside
    ``<div class="large-12 columns software-updates">`` contributes one
    entry ``[author_uname, content, when]`` where:

    * ``author_uname`` is the last ``/``-separated segment of the
      ``user-profile-link`` anchor's ``href``;
    * ``content`` is the stripped text of the class-less ``<p>`` sibling
      following ``<p class="author small">``;
    * ``when`` is the ``datetime`` attribute of ``<time class="timeago">``.

    Returns:
        A list of such entries. If the block is missing, or an article
        lacks one of the expected elements/attributes, ``"No Updates
        found."`` is printed and the entries collected so far are returned
        (articles after the malformed one are not processed).
    """
    updateLst = []
    try:
        section = subObj.find("div", {"class": "large-12 columns software-updates"})
        for update in section.findAll("article"):
            profile_link = update.find("a", {"class": "user-profile-link"})
            author_uname = profile_link["href"].split("/")[-1]
            when = update.find("time", {"class": "timeago"})["datetime"]
            body = update.find("p", {"class": "author small"}).find_next_sibling(
                "p", class_=""
            )
            updateLst.append([author_uname, body.get_text().strip(), when])
    except (AttributeError, KeyError, TypeError):
        # Missing element (None) or missing attribute; anything else,
        # including SystemExit/KeyboardInterrupt, is allowed to propagate.
        print("No Updates found.")
    return updateLst
def getParticipants(subObj):
    """Return the usernames listed in ``<section id="app-team">``.

    For every ``<li>`` in the section, the username is the last
    ``/``-separated segment of the ``user-profile-link`` anchor's ``href``.

    Returns:
        A list of usernames. If the section is missing, or an entry has no
        profile link / ``href``, ``"No team members found."`` is printed
        and the usernames collected so far are returned.
    """
    participantLst = []
    try:
        team = subObj.find("section", id="app-team")
        for entry in team.findAll("li"):
            link = entry.find("a", {"class": "user-profile-link"})
            participantLst.append(link["href"].split("/")[-1])
    except (AttributeError, KeyError, TypeError):
        # Missing element (None) or missing href attribute.
        print("No team members found.")
    return participantLst
def getSkills(userObj):
    """Return the stripped text of each skill tag on a user page.

    Skills are the ``<li>`` items of
    ``<ul class="portfolio-tags no-bullet inline-list">``.

    Returns:
        A list of skill strings; an empty list (after printing
        ``"No Skills found!"``) when the list element is missing.
    """
    try:
        tag_list = userObj.find("ul", class_="portfolio-tags no-bullet inline-list")
        return [item.get_text().strip() for item in tag_list.findAll("li")]
    except AttributeError:
        # The <ul> lookup returned None: the profile lists no skills.
        print("No Skills found!")
        return []
def getinterests(userObj):
    """Return the stripped text of each interest tag on a user page.

    Interests are the ``<li>`` items of the
    ``<ul class="no-bullet inline-list">`` found inside
    ``<div class="tag-list themes clearfix">``.

    Returns:
        A list of interest strings; an empty list (after printing
        ``"No interests found!"``) when either container is missing.
    """
    try:
        themes = userObj.find("div", class_="tag-list themes clearfix")
        tag_list = themes.find("ul", class_="no-bullet inline-list")
        return [item.get_text().strip() for item in tag_list.findAll("li")]
    except AttributeError:
        # One of the container lookups returned None.
        print("No interests found!")
        return []
def getUserData(uname):
    """Fetch and parse the Devpost profile page of *uname*.

    Args:
        uname: Username appended to ``https://devpost.com/``.

    Returns:
        ``[uname, natural_name, skills, interests, image]`` where
        ``natural_name`` is the first line of the stripped
        ``<h1 id="portfolio-user-name">`` text, ``skills`` and
        ``interests`` come from ``getSkills`` / ``getinterests``, and
        ``image`` is the ``src`` of the ``<img>`` inside
        ``<div id="portfolio-user-photo">``.

    Raises:
        Whatever ``urlopen`` raises when the request fails, and
        ``AttributeError`` / ``TypeError`` / ``KeyError`` when the name or
        photo elements are missing from the page (these lookups are not
        guarded).
    """
    url = "https://devpost.com/" + uname
    # Close the HTTP response once parsed instead of leaking the connection;
    # the parser consumes the response inside the constructor call.
    with urlopen(url) as response:
        userObj = BeautifulSoup(response, "html.parser")
    natural_name = (
        userObj.find("h1", id="portfolio-user-name").get_text().strip().split("\n")[0]
    )
    image = userObj.find("div", id="portfolio-user-photo").find("img")["src"]
    skills = getSkills(userObj)
    interests = getinterests(userObj)
    return [uname, natural_name, skills, interests, image]
def getImages(subObj):
    """Return the ``src`` of every image in the ``<div id="gallery">``.

    Each ``<li>`` of the gallery is inspected for an ``<img>``. Entries
    without an ``<img>`` or without a ``src`` attribute are skipped after
    printing ``"Non-Image Link Found"``.

    Returns:
        A list of image sources; an empty list (after printing
        ``"No Gallery Found"``) when the page has no gallery.
    """
    imgList = []
    try:
        entries = subObj.find("div", {"id": "gallery"}).findAll("li")
    except AttributeError:
        # The gallery lookup returned None.
        print("No Gallery Found")
        return imgList
    for entry in entries:
        try:
            imgList.append(entry.find("img")["src"])
        except (AttributeError, KeyError, TypeError):
            # No <img> in this entry (None) or the <img> has no src.
            print("Non-Image Link Found")
    return imgList
def getBuiltWith(subObj):
    """Return the stripped text of each tool tag in the built-with block.

    Tools are the ``<span class="cp-tag">`` elements inside
    ``<div id="built-with">``.

    Returns:
        A list of tool names; an empty list (after printing
        ``"No Tools Found"``) when the block is missing.
    """
    try:
        tags = subObj.find("div", {"id": "built-with"}).findAll(
            "span", {"class": "cp-tag"}
        )
        return [tag.get_text().strip() for tag in tags]
    except AttributeError:
        # The built-with lookup returned None.
        print("No Tools Found")
        return []