
Commit 8bd209f

Merge pull request #220 from axblueblader/master
[Usable] fixed person.py working as of 17 May
2 parents 7807196 + 896c5f3 · commit 8bd209f

4 files changed: +71 −112 lines

README.md

Lines changed: 1 addition & 19 deletions
````diff
@@ -1,4 +1,5 @@
 # Linkedin Scraper
+Forked from https://github.com/joeyism/linkedin_scraper
 
 Scrapes Linkedin User Data
 
@@ -59,21 +60,6 @@ First, you must set your chromedriver location by
 export CHROMEDRIVER=~/chromedriver
 ```
 
-## Sponsor
-[![rds-cost](https://raw.githubusercontent.com/joeyism/linkedin_scraper/master/docs/proxycurl.png)](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism)
-
-Scrape public LinkedIn profile data at scale with [Proxycurl APIs](https://nubela.co/proxycurl/?utm_campaign=influencer%20marketing&utm_source=github&utm_medium=social&utm_term=-&utm_content=joeyism).
-
-• Scraping Public profiles are battle tested in court in HiQ VS LinkedIn case.<br/>
-• GDPR, CCPA, SOC2 compliant<br/>
-• High rate limit - 300 requests/minute<br/>
-• Fast - APIs respond in ~2s<br/>
-• Fresh data - 88% of data is scraped real-time, other 12% are not older than 29 days<br/>
-• High accuracy<br/>
-• Tons of data points returned per profile
-
-Built for developers, by developers.
-
 ## Usage
 To use it, just create the class.
 
@@ -283,7 +269,3 @@ company = Company("https://ca.linkedin.com/company/google", driver=driver)
 
 #### `scrape(close_on_complete=True)`
 This is the meat of the code, where execution of this function scrapes the company. If *close_on_complete* is True (which it is by default), then the browser will close upon completion. If scraping of other companies are desired, then you might want to set that to false so you can keep using the same driver.
-
-## Contribution
-
-<a href="https://www.buymeacoffee.com/joeyism" target="_blank"><img src="https://www.buymeacoffee.com/assets/img/custom_images/orange_img.png" alt="Buy Me A Coffee" style="height: 41px !important;width: 174px !important;box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;-webkit-box-shadow: 0px 3px 2px 0px rgba(190, 190, 190, 0.5) !important;" ></a>
````
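The `scrape(close_on_complete=True)` behaviour documented in the README context above is what makes batch scraping possible. A minimal sketch of reusing one logged-in driver across several companies via `close_on_complete=False`, assuming the `Company` and `actions` API exactly as it appears in this diff (the second URL is a placeholder):

```python
import os

from selenium import webdriver
from linkedin_scraper import Company, actions

driver = webdriver.Chrome()
# Credentials come from the environment, as in samples/scrape_person.py.
actions.login(driver, os.getenv("LINKEDIN_USER"), os.getenv("LINKEDIN_PASSWORD"))

urls = [
    "https://ca.linkedin.com/company/google",
    "https://www.linkedin.com/company/microsoft",  # placeholder URL
]
for url in urls:
    # close_on_complete=False keeps the browser open so the session is reused.
    company = Company(url, driver=driver, get_employees=False, close_on_complete=False)
    print(company.name)

driver.quit()  # only close the driver once every company has been scraped
```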

linkedin_scraper/company.py

Lines changed: 3 additions & 3 deletions
```diff
@@ -184,11 +184,11 @@ def scrape_logged_in(self, get_employees = True, close_on_complete = True):
 
         driver.get(self.linkedin_url)
 
-        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//span[@dir="ltr"]')))
+        _ = WebDriverWait(driver, 3).until(EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]')))
 
         navigation = driver.find_element(By.CLASS_NAME, "org-page-navigation__items ")
 
-        self.name = driver.find_element(By.XPATH,'//span[@dir="ltr"]').text.strip()
+        self.name = driver.find_element(By.CLASS_NAME,"org-top-card-summary__title").text.strip()
 
         # Click About Tab or View All Link
         try:
@@ -360,6 +360,6 @@ def __repr__(self):
         _output['affiliated_companies'] = self.affiliated_companies
         _output['employees'] = self.employees
         _output['headcount'] = self.headcount
-
+
         return json.dumps(_output).replace('\n', '')
```
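The company fix swaps the brittle `//span[@dir="ltr"]` lookups for the page's `org-top-card-summary__title` class. A standalone sketch of the new lookup, assuming an already authenticated Selenium session sitting on a company page:

```python
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

driver = webdriver.Chrome()
driver.get("https://ca.linkedin.com/company/google")  # assumes you are already logged in

# Wait until the page has rendered text containers, as the patched scraper does.
WebDriverWait(driver, 3).until(
    EC.presence_of_all_elements_located((By.XPATH, '//div[@dir="ltr"]'))
)

# The company name is now read from a dedicated class instead of a bare <span>.
name = driver.find_element(By.CLASS_NAME, "org-top-card-summary__title").text.strip()
print(name)
driver.quit()
```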

linkedin_scraper/person.py

Lines changed: 36 additions & 86 deletions
```diff
@@ -1,3 +1,5 @@
+import time
+
 import requests
 from selenium import webdriver
 from selenium.webdriver.common.by import By
@@ -11,7 +13,7 @@
 
 class Person(Scraper):
 
-    __TOP_CARD = "pv-top-card"
+    __TOP_CARD = "scaffold-layout__main"
     __WAIT_FOR_ELEMENT_TIMEOUT = 5
 
     def __init__(
@@ -113,13 +115,15 @@ def get_experiences(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.XPATH,"li"):
-            position = position.find_element(By.CLASS_NAME,"pvs-entity")
-            company_logo_elem, position_details = position.find_elements(By.XPATH,"*")
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME, "pvs-list__paged-list-item"):
+            position = position.find_element(By.CSS_SELECTOR, "div[data-view-name='profile-component-entity']")
+            company_logo_elem, position_details = position.find_elements(By.XPATH, "*")
 
             # company elem
             company_linkedin_url = company_logo_elem.find_element(By.XPATH,"*").get_attribute("href")
+            if not company_linkedin_url:
+                continue
 
             # position details
             position_details_list = position_details.find_elements(By.XPATH,"*")
@@ -143,15 +147,26 @@
                 company = outer_positions[0].find_element(By.TAG_NAME,"span").text
                 work_times = outer_positions[1].find_element(By.TAG_NAME,"span").text
                 location = outer_positions[2].find_element(By.TAG_NAME,"span").text
+            else:
+                position_title = ""
+                company = outer_positions[0].find_element(By.TAG_NAME,"span").text
+                work_times = ""
+                location = ""
+
 
             times = work_times.split("·")[0].strip() if work_times else ""
             duration = work_times.split("·")[1].strip() if len(work_times.split("·")) > 1 else None
 
             from_date = " ".join(times.split(" ")[:2]) if times else ""
             to_date = " ".join(times.split(" ")[3:]) if times else ""
-
-            if position_summary_text and len(position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")) > 1:
-                descriptions = position_summary_text.find_element(By.CLASS_NAME,"pvs-list").find_element(By.CLASS_NAME,"pvs-list").find_elements(By.XPATH,"li")
+            if position_summary_text and any(element.get_attribute("pvs-list__container") for element in position_summary_text.find_elements(By.TAG_NAME, "*")):
+                inner_positions = (position_summary_text.find_element(By.CLASS_NAME,"pvs-list__container")
+                                   .find_element(By.XPATH,"*").find_element(By.XPATH,"*").find_element(By.XPATH,"*")
+                                   .find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"))
+            else:
+                inner_positions = []
+            if len(inner_positions) > 1:
+                descriptions = inner_positions
                 for description in descriptions:
                     res = description.find_element(By.TAG_NAME,"a").find_elements(By.XPATH,"*")
                     position_title_elem = res[0] if len(res) > 0 else None
@@ -200,8 +215,9 @@ def get_educations(self):
         main = self.wait_for_element_to_load(by=By.TAG_NAME, name="main")
         self.scroll_to_half()
         self.scroll_to_bottom()
-        main_list = self.wait_for_element_to_load(name="pvs-list", base=main)
-        for position in main_list.find_elements(By.CLASS_NAME,"pvs-entity"):
+        main_list = self.wait_for_element_to_load(name="pvs-list__container", base=main)
+        for position in main_list.find_elements(By.CLASS_NAME,"pvs-list__paged-list-item"):
+            position = position.find_element(By.XPATH,"//div[@data-view-name='profile-component-entity']")
             institution_logo_elem, position_details = position.find_elements(By.XPATH,"*")
 
             # company elem
@@ -214,13 +230,17 @@
             outer_positions = position_summary_details.find_element(By.XPATH,"*").find_elements(By.XPATH,"*")
 
             institution_name = outer_positions[0].find_element(By.TAG_NAME,"span").text
-            degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            if len(outer_positions) > 1:
+                degree = outer_positions[1].find_element(By.TAG_NAME,"span").text
+            else:
+                degree = None
 
             if len(outer_positions) > 2:
                 times = outer_positions[2].find_element(By.TAG_NAME,"span").text
 
-                from_date = " ".join(times.split(" ")[:2])
-                to_date = " ".join(times.split(" ")[3:])
+                if times != "":
+                    from_date = times.split(" ")[times.split(" ").index("-")-1] if len(times.split(" "))>3 else times.split(" ")[0]
+                    to_date = times.split(" ")[-1]
             else:
                 from_date = None
                 to_date = None
@@ -240,10 +260,9 @@
             self.add_education(education)
 
     def get_name_and_location(self):
-        top_panels = self.driver.find_elements(By.CLASS_NAME,"pv-text-details__left-panel")
-        self.name = top_panels[0].find_elements(By.XPATH,"*")[0].text
-        self.location = top_panels[1].find_element(By.TAG_NAME,"span").text
-
+        top_panel = self.driver.find_element(By.XPATH, "//*[@class='mt2 relative']")
+        self.name = top_panel.find_element(By.TAG_NAME, "h1").text
+        self.location = top_panel.find_element(By.XPATH, "//*[@class='text-body-small inline t-black--light break-words']").text
 
     def get_about(self):
         try:
@@ -288,75 +307,6 @@ def scrape_logged_in(self, close_on_complete=True):
             self.get_educations()
 
         driver.get(self.linkedin_url)
-
-        # get interest
-        try:
-
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            interestContainer = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-interests-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for interestElement in interestContainer.find_elements(By.XPATH,
-                "//*[@class='pv-interest-entity pv-profile-section__card-item ember-view']"
-            ):
-                interest = Interest(
-                    interestElement.find_element(By.TAG_NAME, "h3").text.strip()
-                )
-                self.add_interest(interest)
-        except:
-            pass
-
-        # get accomplishment
-        try:
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located(
-                    (
-                        By.XPATH,
-                        "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']",
-                    )
-                )
-            )
-            acc = driver.find_element(By.XPATH,
-                "//*[@class='pv-profile-section pv-accomplishments-section artdeco-container-card artdeco-card ember-view']"
-            )
-            for block in acc.find_elements(By.XPATH,
-                "//div[@class='pv-accomplishments-block__content break-words']"
-            ):
-                category = block.find_element(By.TAG_NAME, "h3")
-                for title in block.find_element(By.TAG_NAME,
-                    "ul"
-                ).find_elements(By.TAG_NAME, "li"):
-                    accomplishment = Accomplishment(category.text, title.text)
-                    self.add_accomplishment(accomplishment)
-        except:
-            pass
-
-        # get connections
-        try:
-            driver.get("https://www.linkedin.com/mynetwork/invite-connect/connections/")
-            _ = WebDriverWait(driver, self.__WAIT_FOR_ELEMENT_TIMEOUT).until(
-                EC.presence_of_element_located((By.CLASS_NAME, "mn-connections"))
-            )
-            connections = driver.find_element(By.CLASS_NAME, "mn-connections")
-            if connections is not None:
-                for conn in connections.find_elements(By.CLASS_NAME, "mn-connection-card"):
-                    anchor = conn.find_element(By.CLASS_NAME, "mn-connection-card__link")
-                    url = anchor.get_attribute("href")
-                    name = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__name").text.strip()
-                    occupation = conn.find_element(By.CLASS_NAME, "mn-connection-card__details").find_element(By.CLASS_NAME, "mn-connection-card__occupation").text.strip()
-
-                    contact = Contact(name=name, occupation=occupation, url=url)
-                    self.add_contact(contact)
-        except:
-            connections = None
-
         if close_on_complete:
             driver.quit()
```
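The education dates are no longer split at fixed word positions; the new code anchors on the "-" separator, so both "Sep 2018 - Jun 2022" and "2018 - 2022" style ranges parse. A self-contained sketch of that logic (the sample strings are assumptions, and the `else` branch is added here only to make the helper total):

```python
def parse_education_dates(times: str):
    # Mirrors the parsing introduced in get_educations.
    parts = times.split(" ")
    if times != "":
        # Long form "Sep 2018 - Jun 2022": take the token just before "-".
        # Short form "2018 - 2022": take the first token.
        from_date = parts[parts.index("-") - 1] if len(parts) > 3 else parts[0]
        to_date = parts[-1]
    else:
        from_date = to_date = None
    return from_date, to_date

print(parse_education_dates("Sep 2018 - Jun 2022"))  # ('2018', '2022')
print(parse_education_dates("2018 - 2022"))          # ('2018', '2022')
```

Either form reduces to the start and end years; the month tokens are dropped by the index arithmetic.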

samples/scrape_person.py

Lines changed: 31 additions & 4 deletions
```diff
@@ -1,9 +1,36 @@
 import os
-from linkedin_scraper import Person, actions
+from linkedin_scraper import Person, actions, Company
 from selenium import webdriver
-driver = webdriver.Chrome("./chromedriver")
+
+driver = webdriver.Chrome()
 
 email = os.getenv("LINKEDIN_USER")
 password = os.getenv("LINKEDIN_PASSWORD")
-actions.login(driver, email, password) # if email and password isnt given, it'll prompt in terminal
-person = Person("https://www.linkedin.com/in/andre-iguodala-65b48ab5", driver=driver)
+actions.login(driver, email, password)  # if email and password isnt given, it'll prompt in terminal
+user_input = []
+urls = []
+while True:
+    user_input = input("Enter a comma-separated list of linkedin urls: ")
+    if user_input == "exit":
+        break
+    urls = user_input.split(",")
+    results = []
+    for url in urls:
+        print(f'scraping {url}')
+        person = Person(url, driver=driver, close_on_complete=False)
+        company = Company(person.experiences[0].linkedin_url, get_employees=False, driver=driver, close_on_complete=False)
+        results.append((person, company))
+
+    print('RESULTS:')
+    print('name,location,exp_title,exp_company,exp_linkedin,company_industry,company_website,company_size')
+    for person, company in results:
+        experience = person.experiences[0]
+        print(f'"{person.name}", '
+              f'"{person.location}", '
+              f'"{experience.position_title}", '
+              f'"{experience.institution_name}", '
+              f'"{experience.linkedin_url}", '
+              f'"{company.industry}", '
+              f'"{company.website}", '
+              f'"{company.company_size}", '
+              )
```
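To try the updated sample, export `LINKEDIN_USER` and `LINKEDIN_PASSWORD` before launching it (otherwise `actions.login` prompts in the terminal), paste one or more profile URLs separated by commas at the prompt, and type `exit` to leave the loop.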
