diff --git a/README.md b/README.md
index d8f3988..a1ef7d3 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,5 @@
# Linkedin Scraper
+Forked from https://github.com/joeyism/linkedin_scraper
Scrapes Linkedin User Data
@@ -59,21 +60,6 @@ First, you must set your chromedriver location by
export CHROMEDRIVER=~/chromedriver
```
-## Sponsor
-[](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)
-
-Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).
-
-• Scraping Public profiles are battle tested in court in HiQ VS LinkedIn case.
-• GDPR, CCPA, SOC2 compliant
-• High rate limit - 300 requests/minute
-• Fast - APIs respond in ~2s
-• Fresh data - 88% of data is scraped real-time, other 12% are not older than 29 days
-• High accuracy
-• Tons of data points returned per profile
-
-Built for developers, by developers.
-
## Usage
To use it, just create the class.
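+
+A minimal sketch (mirroring `samples/scrape_person.py` in this repo; `actions.login` prompts in the terminal when the environment variables are unset):
+
+```python
+import os
+
+from selenium import webdriver
+
+from linkedin_scraper import Person, actions
+
+driver = webdriver.Chrome()
+actions.login(driver, os.getenv("LINKEDIN_USER"), os.getenv("LINKEDIN_PASSWORD"))
+
+# instantiating Person triggers the scrape
+person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+print(person.name, person.location)
+```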
@@ -283,7 +269,3 @@ company = Company("https://ca.linkedin.com/company/google", driver=driver)
#### `scrape(close_on_complete=True)`
This does the heavy lifting: calling this function scrapes the company. If *close_on_complete* is True (the default), the browser closes upon completion. If you want to scrape other companies, set it to False so you can keep using the same driver, as sketched below.
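+
+For example, to scrape several companies with one browser session (a sketch based on the constructor shown above; the second URL is purely illustrative):
+
+```python
+from selenium import webdriver
+
+from linkedin_scraper import Company
+
+driver = webdriver.Chrome()
+urls = [
+    "https://ca.linkedin.com/company/google",
+    "https://www.linkedin.com/company/microsoft",  # illustrative URL
+]
+for url in urls:
+    # close_on_complete=False keeps the driver alive for the next company
+    company = Company(url, driver=driver, close_on_complete=False)
+    print(company.name)
+driver.quit()
+```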
-
-## Contribution
-
-
diff --git a/linkedin_scraper/company.py b/linkedin_scraper/company.py
index 77900eb..9293597 100644
--- a/linkedin_scraper/company.py
+++ b/linkedin_scraper/company.py
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
driver.get(self.linkedin_url)
- _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+ _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
- self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+ self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
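+ # the company name no longer lives in span[@dir="ltr"]; read it from the top-card title instead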
# Click About Tab or View All Link
try:
@@ -360,6 +360,6 @@ def __repr__(self):
_output['affiliated_companies'] = self.affiliated_companies
_output['employees'] = self.employees
_output['headcount'] = self.headcount
-
+
return json.dumps(_output).replace('\n', '')
diff --git a/linkedin_scraper/person.py b/linkedin_scraper/person.py
index 86d169e..9c83217 100644
--- a/linkedin_scraper/person.py
+++ b/linkedin_scraper/person.py
@@ -1,3 +1,5 @@
+import time
+
import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@
class Person(Scraper):
- __TOP_CARD = "pv-top-card"
+ __TOP_CARD = "scaffold-layout__main"
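+ # NOTE: LinkedIn renamed this anchor from "pv-top-card"; its class names change often, so update this if scraping breaks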
__WAIT_FOR_ELEMENT_TIMEOUT = 5
def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
self.scroll_to_half()
self.scroll_to_bottom()
- main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
- for position in main_list.find_elements(By.XPATH,"li"):
- position = position.find_element(By.CLASS_NAME,"pvs-entity")
- company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+ main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+ for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+ position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
+ company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
# company elem
company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
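+ # some entries may lack a company link (e.g. grouped positions); skip them rather than crash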
+ if not company_linkedin_url:
+ continue
# position details
position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@ def get_experiences(self):
company = outer_positions[0].find_element(By.TAG_NAME,"span").text
work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+ else:
+ position_title = ""
+ company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+ work_times = ""
+ location = ""
+
times = work_times.split("·")[0].strip() if work_times else ""
duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
from_date = " ".join(times.split(" ")[:2]) if times else ""
to_date = " ".join(times.split(" ")[3:]) if times else ""
-
- if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
- descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
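+ # several roles at the same company render as a nested list; look for the inner container before descending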
+ if position_summary_text and any("pvs-list__container" in (element.get_attribute("class") or "") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+ inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+ .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+ .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+ else:
+ inner_positions = []
+ if len(inner_positions) > 1:
+ descriptions = inner_positions
for description in descriptions:
res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
self.scroll_to_half()
self.scroll_to_bottom()
- main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
- for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+ main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+ for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+ position = position.find_element(By.XPATH, ".//div[@data-view-name='profile-component-entity']")  # ".//" keeps the search relative to this list item
institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")
# company elem
@@ -214,13 +230,17 @@ def get_educations(self):
outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
- degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+ if len(outer_positions) > 1:
+ degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+ else:
+ degree = None
if len(outer_positions) > 2:
times = outer_positions[2].find_element(By.TAG_NAME,"span").text
- from_date = " ".join(times.split(" ")[:2])
- to_date = " ".join(times.split(" ")[3:])
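+ # times usually looks like "2019 - 2023" or "Jan 2019 - Jan 2023"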
+ if times != "":
+ from_date = times.split(" ")[times.split(" ").index("-")-1] if "-" in times.split(" ") else times.split(" ")[0]
+ to_date = times.split(" ")[-1]
else:
from_date = None
to_date = None
@@ -240,10 +260,9 @@ def get_educations(self):
self.add_education(education)
def get_name_and_location(self):
- top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
- self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
- self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
-
+ top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+ self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+ self.location = top_panel.find_element(By.XPATH, ".//*[@class='text-body-small inline t-black--light break-words']").text
def get_about(self):
try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
self.get_educations()
driver.get(self.linkedin_url)
-
- # get interest
- try:
-
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located(
- (
- By.XPATH,
- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
- )
- )
- )
- interestContainer = driver.find_element(By.XPATH,
- "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
- )
- for interestElement in interestContainer.find_elements(By.XPATH,
- "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
- ):
- interest = Interest(
- interestElement.find_element(By.TAG_NAME, "h3").text.strip()
- )
- self.add_interest(interest)
- except:
- pass
-
- # get accomplishment
- try:
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located(
- (
- By.XPATH,
- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
- )
- )
- )
- acc = driver.find_element(By.XPATH,
- "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
- )
- for block in acc.find_elements(By.XPATH,
- "//div[@class='pv-accomplishments-block__content break-words']"
- ):
- category = block.find_element(By.TAG_NAME, "h3")
- for title in block.find_element(By.TAG_NAME,
- "ul"
- ).find_elements(By.TAG_NAME, "li"):
- accomplishment = Accomplishment(category.text, title.text)
- self.add_accomplishment(accomplishment)
- except:
- pass
-
- # get connections
- try:
- driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
- _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
- EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
- )
- connections = driver.find_element(By.CLASS_NAME, "mn-connections")
- if connections is not None:
- for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
- anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
- url = anchor.get_attribute("href")
- name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
- occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
- contact = Contact(name=name, occupation=occupation, url=url)
- self.add_contact(contact)
- except:
- connections = None
-
if close_on_complete:
driver.quit()
diff --git a/samples/scrape_person.py b/samples/scrape_person.py
index 7d4e93f..c34f70e 100644
--- a/samples/scrape_person.py
+++ b/samples/scrape_person.py
@@ -1,9 +1,36 @@
import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
email = os.getenv("LINKEDIN_USER")
password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password)  # if email and password aren't given, you'll be prompted in the terminal
+while True:
+ user_input = input("Enter a comma-separated list of linkedin urls (or 'exit' to quit): ")
+ if user_input == "exit":
+ break
+ urls = [url.strip() for url in user_input.split(",")]
+ results = []
+ for url in urls:
+ print(f'scraping {url}')
+ person = Person(url, driver=driver, close_on_complete=False)
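+ # NOTE: assumes the profile lists at least one experience with a company link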
+ company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+ results.append((person, company))
+
+ print('RESULTS:')
+ print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+ for person, company in results:
+ experience = person.experiences[0]
+ print(f'"{person.name}", '
+ f'"{person.location}", '
+ f'"{experience.position_title}", '
+ f'"{experience.institution_name}", '
+ f'"{experience.linkedin_url}", '
+ f'"{company.industry}", '
+ f'"{company.website}", '
+ f'"{company.company_size}", '
+ )