
Commit 8bd209f

Merge pull request #220 from axblueblader/master
[Usable] fixed person.py working as of 17 May
2 parents 7807196 + 896c5f3 · commit 8bd209f

4 files changed: +71 −112 lines

README.md

Lines changed: 1 addition & 19 deletions
````diff
@@ -1,4 +1,5 @@
 # Linkedin Scraper
+Forked from https://github.com/joeyism/linkedin_scraper
 
 Scrapes Linkedin User Data
 
@@ -59,21 +60,6 @@ First, you must set your chromedriver location by
 export CHROMEDRIVER=~/chromedriver
 ```
 
-## Sponsor
-[![rds-cost](https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png)](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)
-
-Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).
-
-• Scraping Public profiles are battle tested in court in HiQ VS LinkedIn case.<br/>
-• GDPR, CCPA, SOC2 compliant<br/>
-• High rate limit - 300 requests/minute<br/>
-• Fast - APIs respond in ~2s<br/>
-• Fresh data - 88% of data is scraped real-time, other 12% are not older than 29 days<br/>
-• High accuracy<br/>
-• Tons of data points returned per profile
-
-Built for developers, by developers.
-
 ## Usage
 To use it, just create the class.
 
@@ -283,7 +269,3 @@ company = Company("https://ca.linkedin.com/company/google", driver=driver)
 
 #### `scrape(close_on_complete=True)`
 This is the meat of the code, where execution of this function scrapes the company. If *close_on_complete* is True (which it is by default), then the browser will close upon completion. If scraping of other companies are desired, then you might want to set that to false so you can keep using the same driver.
-
-## Contribution
-
-<a href="https://www.buymeacoffee.com/joeyism" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 41px !important;width: 174px !important;box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a>
````
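The `scrape(close_on_complete=True)` behaviour documented in the README context above is what makes batch scraping possible. A minimal sketch of reusing one logged-in driver across several companies via `close_on_complete=False`, assuming the `Company` and `actions` API exactly as it appears in this diff (the second URL is a placeholder):

```python
import os

from selenium import webdriver
from linkedin_scraper import Company, actions

driver = webdriver.Chrome()
# Credentials come from the environment, as in samples/scrape_person.py.
actions.login(driver, os.getenv("LINKEDIN_USER"), os.getenv("LINKEDIN_PASSWORD"))

urls = [
    "https://ca.linkedin.com/company/google",
    "https://www.linkedin.com/company/microsoft",  # placeholder URL
]
for url in urls:
    # close_on_complete=False keeps the browser open so the session is reused.
    company = Company(url, driver=driver, get_employees=False, close_on_complete=False)
    print(company.name)

driver.quit()  # only close the driver once every company has been scraped
```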

linkedin_scraper/company.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
 
         driver.get(self.linkedin_url)
 
-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
 
         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
 
-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
 
         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount
-
+
         return json.dumps(_output).replace('\n', '')
```
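The company fix swaps the brittle `//span[@dir="ltr"]` lookups for the page's `org-top-card-summary__title` class. A standalone sketch of the new lookup, assuming an already authenticated Selenium session sitting on a company page:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://ca.linkedin.com/company/google")  # assumes you are already logged in

# Wait until the page has rendered text containers, as the patched scraper does.
WebDriverWait(driver, 3).until(
    EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]'))
)

# The company name is now read from a dedicated class instead of a bare <span>.
name = driver.find_element(By.CLASS_NAME, "org-top-card-summary__title").text.strip()
print(name)
driver.quit()
```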

linkedin_scraper/person.py

Lines changed: 36 additions & 86 deletions
```diff
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@
 
 class Person(Scraper):
 
-    __TOP_CARD = "pv-top-card"
+    __TOP_CARD = "scaffold-layout__main"
     __WAIT_FOR_ELEMENT_TIMEOUT = 5
 
     def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity")
-            company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
+            company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
 
             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue
 
             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@
                 company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                 work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                 location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""
+
 
             times = work_times.split("·")[0].strip() if work_times else ""
             duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
 
             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""
-
-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
+            if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                   .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                   .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            else:
+                inner_positions = []
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
                 for description in descriptions:
                     res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                     position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
             institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")
 
             # company elem
@@ -214,13 +230,17 @@
             outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
 
             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
-            degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            if len(outer_positions) > 1:
+                degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            else:
+                degree = None
 
             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text
 
-                from_date = " ".join(times.split(" ")[:2])
-                to_date = " ".join(times.split(" ")[3:])
+                if times != "":
+                    from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
+                    to_date = times.split(" ")[-1]
             else:
                 from_date = None
                 to_date = None
@@ -240,10 +260,9 @@
             self.add_education(education)
 
     def get_name_and_location(self):
-        top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
-        self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
-        self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
-
+        top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+        self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
 
     def get_about(self):
         try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
             self.get_educations()
 
         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()
```
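The education dates are no longer split at fixed word positions; the new code anchors on the "-" separator, so both "Sep 2018 - Jun 2022" and "2018 - 2022" style ranges parse. A self-contained sketch of that logic (the sample strings are assumptions, and the `else` branch is added here only to make the helper total):

```python
def parse_education_dates(times: str):
    # Mirrors the parsing introduced in get_educations.
    parts = times.split(" ")
    if times != "":
        # Long form "Sep 2018 - Jun 2022": take the token just before "-".
        # Short form "2018 - 2022": take the first token.
        from_date = parts[parts.index("-") - 1] if len(parts) > 3 else parts[0]
        to_date = parts[-1]
    else:
        from_date = to_date = None
    return from_date, to_date

print(parse_education_dates("Sep 2018 - Jun 2022"))  # ('2018', '2022')
print(parse_education_dates("2018 - 2022"))          # ('2018', '2022')
```

Either form reduces to the start and end years; the month tokens are dropped by the index arithmetic.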

samples/scrape_person.py

Lines changed: 31 additions & 4 deletions
```diff
@@ -1,9 +1,36 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
 
 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password)  # if email and password isnt given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of linkedin urls: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url, driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('RESULTS:')
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        print(f'"{person.name}", '
+              f'"{person.location}", '
+              f'"{experience.position_title}", '
+              f'"{experience.institution_name}", '
+              f'"{experience.linkedin_url}", '
+              f'"{company.industry}", '
+              f'"{company.website}", '
+              f'"{company.company_size}", '
+              )
```
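To try the updated sample, export `LINKEDIN_USER` and `LINKEDIN_PASSWORD` before launching it (otherwise `actions.login` prompts in the terminal), paste one or more profile URLs separated by commas at the prompt, and type `exit` to leave the loop.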
