Skip to content

Commit

Permalink
enh: potential email (#5)
Browse files Browse the repository at this point in the history
  • Loading branch information
cullenwatson authored Jun 5, 2024
1 parent 25e605c commit 72ef2ee
Show file tree
Hide file tree
Showing 5 changed files with 61 additions and 5 deletions.
37 changes: 36 additions & 1 deletion poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 2 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "staffspy"
version = "0.1.4"
version = "0.1.5"
description = "Staff scraper library for LinkedIn"
authors = ["Cullen Watson <cullen@bunsly.com>"]
readme = "README.md"
Expand All @@ -11,6 +11,7 @@ selenium = "^4.21.0"
pydantic = "^2.7.2"
pandas = "^2.2.2"
requests = "^2.32.3"
tldextract = "^5.1.2"


[tool.poetry.group.dev.dependencies]
Expand Down
9 changes: 7 additions & 2 deletions staffspy/linkedin/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,8 @@ class LinkedInScraper:
def __init__(self, session_file):
self.session = utils.load_session(session_file)
self.company_id = self.staff_count = self.num_staff = self.company_name = (
self.max_results
) = self.search_term = None
self.domain
) = self.max_results = self.search_term = None

def get_company_id(self, company_name):
res = self.session.get(f"{self.company_id_ep}{company_name}")
Expand All @@ -40,6 +40,7 @@ def get_company_id(self, company_name):
logger.debug(res.text[:200])
sys.exit()
company = response_json["elements"][0]
self.domain = utils.extract_base_domain(company["companyPageUrl"])
staff_count = company["staffCount"]
company_id = company["trackingInfo"]["objectUrn"].split(":")[-1]
logger.info(f"Found company {company_name} with {staff_count} staff")
Expand Down Expand Up @@ -95,6 +96,10 @@ def parse_emp(self, emp, emp_dict):
emp.profile_photo = profile_photo
emp.first_name = emp_dict["firstName"]
emp.last_name = emp_dict["lastName"]
emp.potential_email = utils.create_email(
emp.first_name, emp.last_name, self.domain
)

emp.followers = emp_dict.get("followingState", {}).get("followerCount")
emp.connections = emp_dict["connections"]["paging"]["total"]
emp.location = emp_dict["geoLocation"]["geo"]["defaultLocalizedName"]
Expand Down
4 changes: 3 additions & 1 deletion staffspy/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class Skill(BaseModel):
def to_dict(self):
return {
"name": self.name,
"endorsements": self.endorsements,
"endorsements": self.endorsements if self.endorsements else 0,
}


Expand Down Expand Up @@ -75,6 +75,7 @@ class Staff(BaseModel):
profile_link: str | None = None
first_name: str | None = None
last_name: str | None = None
potential_email: str | None = None
followers: int | None = None
connections: int | None = None
location: str | None = None
Expand All @@ -98,6 +99,7 @@ def to_dict(self):
"profile_id": self.profile_id,
"first_name": self.first_name,
"last_name": self.last_name,
"potential_email": self.potential_email,
"company": self.company,
"school": self.school,
"location": self.location,
Expand Down
13 changes: 13 additions & 0 deletions staffspy/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from datetime import datetime

import requests
import tldextract
from selenium import webdriver
from selenium.common.exceptions import WebDriverException

Expand All @@ -26,6 +27,18 @@ def set_csrf_token(session):
return session


def extract_base_domain(url: str):
    """Return the ``<domain>.<suffix>`` part of *url*.

    The URL is split with ``tldextract.extract`` and the resulting
    ``domain`` and ``suffix`` fields are joined with a single dot.
    """
    parts = tldextract.extract(url)
    return f"{parts.domain}.{parts.suffix}"


def create_email(first, last, domain):
    """Build a guessed ``first.last@domain`` address.

    Only the alphabetic characters (as judged by ``str.isalpha``) of each
    name part are kept, and both parts are lower-cased.

    Args:
        first: Given name; may be ``None`` or empty.
        last: Family name; may be ``None`` or empty.
        domain: Domain placed after the ``@``; may be ``None`` or empty.

    Returns:
        The guessed address, or ``None`` when no sensible address can be
        formed (a missing name or domain, or a name part that contains no
        alphabetic characters).
    """
    # Missing inputs would otherwise raise TypeError in filter() or yield
    # strings such as "john.doe@None".
    if not (first and last and domain):
        return None

    local_first = "".join(filter(str.isalpha, first)).lower()
    local_last = "".join(filter(str.isalpha, last)).lower()

    # A name made only of digits/punctuation/emoji strips down to nothing,
    # which would produce a malformed local part like ".doe".
    if not (local_first and local_last):
        return None

    return f"{local_first}.{local_last}@{domain}"


def get_webdriver():
for browser in [webdriver.Firefox, webdriver.Chrome]:
try:
Expand Down

0 comments on commit 72ef2ee

Please sign in to comment.