20 changes: 1 addition & 19 deletions README.md
@@ -1,4 +1,5 @@
# Linkedin Scraper
Forked from https://github.com/joeyism/linkedin_scraper

Scrapes Linkedin User Data

@@ -59,21 +60,6 @@ First, you must set your chromedriver location by
export CHROMEDRIVER=~/chromedriver
```

## Sponsor
[![rds-cost](https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png)](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)

Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).

• Scraping Public profiles are battle tested in court in HiQ VS LinkedIn case.<br/>
• GDPR, CCPA, SOC2 compliant<br/>
• High rate limit - 300 requests/minute<br/>
• Fast - APIs respond in ~2s<br/>
• Fresh data - 88% of data is scraped real-time, other 12% are not older than 29 days<br/>
• High accuracy<br/>
• Tons of data points returned per profile

Built for developers, by developers.

## Usage
To use it, just create the class.
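
A minimal sketch of that (assuming a working chromedriver and LinkedIn credentials in the environment; the profile URL is just a placeholder):

```python
import os

from selenium import webdriver
from linkedin_scraper import Person, actions

driver = webdriver.Chrome()

# Log in first -- profile data is only visible to authenticated users.
# If email and password aren't given, actions.login prompts in the terminal.
actions.login(driver, os.getenv("LINKEDIN_USER"), os.getenv("LINKEDIN_PASSWORD"))

# Creating the class scrapes the profile immediately.
person = Person("https://www.linkedin.com/in/some-profile", driver=driver)
print(person.name, person.location)
```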

@@ -283,7 +269,3 @@ company = Company("https://ca.linkedin.com/company/google", driver=driver)

#### `scrape(close_on_complete=True)`
This is the meat of the code: calling this function scrapes the company. If *close_on_complete* is True (which it is by default), the browser will close upon completion. If you want to scrape other companies afterwards, set it to False so you can keep using the same driver.
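
A minimal sketch of that pattern (assuming credentials in LINKEDIN_USER/LINKEDIN_PASSWORD as in the sample script; the second company URL is only a placeholder):

```python
import os

from selenium import webdriver
from linkedin_scraper import Company, actions

driver = webdriver.Chrome()
actions.login(driver, os.getenv("LINKEDIN_USER"), os.getenv("LINKEDIN_PASSWORD"))

urls = [
    "https://ca.linkedin.com/company/google",
    "https://www.linkedin.com/company/microsoft",  # placeholder
]

companies = []
for url in urls:
    # close_on_complete=False keeps the browser open so the same
    # driver can be reused for the next company in the list.
    companies.append(Company(url, driver=driver, get_employees=False, close_on_complete=False))

for company in companies:
    print(company.name, company.industry, company.website)

driver.quit()  # shut the browser down once everything has been scraped
```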

## Contribution

<a href="https://www.buymeacoffee.com/joeyism" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 41px !important;width: 174px !important;box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a>
6 changes: 3 additions & 3 deletions linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):

driver.get(self.linkedin_url)

_ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
_ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))

navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")

self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()

# Click About Tab or View All Link
try:
@@ -360,6 +360,6 @@ def __repr__(self):
_output['affiliated_companies'] = self.affiliated_companies
_output['employees'] = self.employees
_output['headcount'] = self.headcount

return json.dumps(_output).replace('\n', '')

122 changes: 36 additions & 86 deletions linkedin_scraper/person.py
@@ -1,3 +1,5 @@
import time

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@

class Person(Scraper):

__TOP_CARD = "pv-top-card"
__TOP_CARD = "scaffold-layout__main"
__WAIT_FOR_ELEMENT_TIMEOUT = 5

def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
self.scroll_to_half()
self.scroll_to_bottom()
main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
for position in main_list.find_elements(By.XPATH,"li"):
position = position.find_element(By.CLASS_NAME,"pvs-entity")
company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
company_logo_elem, position_details = position.find_elements(By.XPATH, "*")

# company elem
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
if not company_linkedin_url:
continue

# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
else:
position_title = ""
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
work_times = ""
location = ""


times = work_times.split("·")[0].strip() if work_times else ""
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None

from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times else ""

if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
.find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
else:
inner_positions = []
if len(inner_positions) > 1:
descriptions = inner_positions
for description in descriptions:
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
self.scroll_to_half()
self.scroll_to_bottom()
main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")

# company elem
@@ -214,13 +230,17 @@
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")

institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
if len(outer_positions) > 1:
degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
else:
degree = None

if len(outer_positions) > 2:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text

from_date = " ".join(times.split(" ")[:2])
to_date = " ".join(times.split(" ")[3:])
if times != "":
from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
to_date = times.split(" ")[-1]
else:
from_date = None
to_date = None
@@ -240,10 +260,9 @@
self.add_education(education)

def get_name_and_location(self):
top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
self.location = top_panels[1].find_element(By.TAG_NAME,"span").text

top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
self.name = top_panel.find_element(By.TAG_NAME, "h1").text
self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text

def get_about(self):
try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
self.get_educations()

driver.get(self.linkedin_url)

# get interest
try:

_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located(
(
By.XPATH,
"//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
)
)
)
interestContainer = driver.find_element(By.XPATH,
"//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
)
for interestElement in interestContainer.find_elements(By.XPATH,
"//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
):
interest = Interest(
interestElement.find_element(By.TAG_NAME, "h3").text.strip()
)
self.add_interest(interest)
except:
pass

# get accomplishment
try:
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located(
(
By.XPATH,
"//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
)
)
)
acc = driver.find_element(By.XPATH,
"//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
)
for block in acc.find_elements(By.XPATH,
"//div[@class='pv-accomplishments-block__content break-words']"
):
category = block.find_element(By.TAG_NAME, "h3")
for title in block.find_element(By.TAG_NAME,
"ul"
).find_elements(By.TAG_NAME, "li"):
accomplishment = Accomplishment(category.text, title.text)
self.add_accomplishment(accomplishment)
except:
pass

# get connections
try:
driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
_ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
)
connections = driver.find_element(By.CLASS_NAME, "mn-connections")
if connections is not None:
for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
url = anchor.get_attribute("href")
name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()

contact = Contact(name=name, occupation=occupation, url=url)
self.add_contact(contact)
except:
connections = None

if close_on_complete:
driver.quit()

35 changes: 31 additions & 4 deletions samples/scrape_person.py
@@ -1,9 +1,36 @@
import os
from linkedin_scraper import Person, actions
from linkedin_scraper import Person, actions, Company
from selenium import webdriver
driver = webdriver.Chrome("./chromedriver")

driver = webdriver.Chrome()

email = os.getenv("LINKEDIN_USER")
password = os.getenv("LINKEDIN_PASSWORD")
actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
user_input = []
urls = []
while True:
user_input = input("Enter a comma-separated list of linkedin urls: ")
if user_input == "exit":
break
urls = user_input.split(",")
results = []
for url in urls:
print(f'scraping {url}')
person = Person(url, driver=driver, close_on_complete=False)
company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
results.append((person, company))

print('RESULTS:')
print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
for person, company in results:
experience = person.experiences[0]
print(f'"{person.name}", '
f'"{person.location}", '
f'"{experience.position_title}", '
f'"{experience.institution_name}", '
f'"{experience.linkedin_url}", '
f'"{company.industry}", '
f'"{company.website}", '
f'"{company.company_size}", '
)