diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..e1699fa --- /dev/null +++ b/.gitignore @@ -0,0 +1,209 @@ +# Created by .ignore support plugin (hsz.mobi) +### Python template +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# celery beat schedule file +celerybeat-schedule + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +### JetBrains template +# Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and WebStorm +# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 + +# User-specific stuff +.idea/**/workspace.xml +.idea/**/tasks.xml +.idea/**/usage.statistics.xml +.idea/**/dictionaries +.idea/**/shelf + +# Generated files +.idea/**/contentModel.xml + +# Sensitive or high-churn files +.idea/**/dataSources/ +.idea/**/dataSources.ids +.idea/**/dataSources.local.xml +.idea/**/sqlDataSources.xml +.idea/**/dynamic.xml +.idea/**/uiDesigner.xml +.idea/**/dbnavigator.xml + +# Gradle +.idea/**/gradle.xml +.idea/**/libraries + +# Gradle and Maven with auto-import +# When using Gradle or Maven with auto-import, you should exclude module files, +# since they will be recreated, and may cause churn. Uncomment if using +# auto-import. 
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based Rest Client
+.idea/httpRequests
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+### Example user template
+
+# IntelliJ project files
+.idea
+*.iml
+out
+gen
+
+# Custom
+*.secret
+chromedriver
diff --git a/README.md b/README.md
index 1b72e05..586eeb6 100644
--- a/README.md
+++ b/README.md
@@ -1 +1,76 @@
-# linkedin_scraper
\ No newline at end of file
+# ScrapIn
+This module is a Selenium wrapper tailored to searching for people or jobs on LinkedIn given a set of criteria.
+
+**WARNING:**
+While scraping LinkedIn is supposedly legal (see this [legal proceeding](https://www.reuters.com/article/us-microsoft-linkedin-ruling-idUSKCN1AU2BV?feedType=RSS&feedName=technologyNews)), I do not take any responsibility for any retaliation LinkedIn might take against the user account used with this package.
+
+
+### System
+
+
+As of: 2019-09-20
+
+CPython 3.7.3
+
+numpy 1.16.4\
+pandas 0.24.2\
+selenium 3.141.0\
+bs4 4.7.1
+
+compiler : GCC 7.3.0\
+system : Linux\
+release : 5.0.0-29-generic\
+machine : x86_64\
+processor : x86_64\
+CPU cores : 12\
+interpreter: 64bit
+
+(The exact versions installed by the setup below are pinned in `requirements.txt`.)
+
+### Setup
+To get started, run the following commands:
+
+```
+pip install --upgrade pip
+git clone git@github.com:maximemerabet/scrapIn.git
+cd scrapIn
+python3 -m venv .env
+source .env/bin/activate
+pip install -r requirements.txt
+```
+
+You will also need to download the chromedriver matching your OS and Chrome version.\
+It can be found [here](https://sites.google.com/a/chromium.org/chromedriver/downloads).
+Once downloaded, move the executable to the repository root (`scrapIn/`) and run: `chmod +x chromedriver`
+
+(You can also keep the driver in Selenium's recommended location, but you will then need to override `CHROMEDRIVER_FILEPATH` in `config.py`.)
+
+
+### How to use:
+The following is example code for a people-specific search (a notebook demo can be found [here](examples/search_people.ipynb)):
+
+```
+# Import the scraper (run from the repository root, or add it to sys.path first)
+from src.scraper import Authenticator, Scraper
+
+# Credential prompt
+auth = Authenticator()
+
+# Instantiate scraper object
+scraper = Scraper(authenticator=auth)
+
+# Execute search (returns DataFrame)
+people_search = scraper.search_people(job_title='Executive Chef',
+                                      location=['London', 'Manchester'],
+                                      industry='Hospitality')
+
+```
+
+For non-interactive authentication, see the note at the end of this README.
+
+### TODO
+- Add deep-search functionality (given a list of profile URLs, retrieve more detailed information)
+- Write up `test_config` to test for the presence of required elements on the LinkedIn web page
+- (Optional) Add functionality to execute other types of search (i.e. jobs, etc.)
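+
+### Notes
+`Authenticator` also accepts credentials programmatically, which avoids the interactive prompt in scripted runs. A minimal sketch, assuming the placeholder credentials and output path below are replaced with your own (storing credentials in plain text is discouraged):
+
+```
+from src.scraper import Authenticator, Scraper
+
+# Pass [username, password] directly instead of being prompted
+auth = Authenticator(credentials=['user@example.com', 'not-a-real-password'])
+scraper = Scraper(authenticator=auth)
+
+people_search = scraper.search_people(job_title='Executive Chef',
+                                      location=['London', 'Manchester'],
+                                      industry='Hospitality')
+
+# Persist the results for later use (standard pandas API)
+people_search.to_csv('people_search.csv', index=False)
+```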
+ + + diff --git a/examples/search_people.ipynb b/examples/search_people.ipynb new file mode 100644 index 0000000..dbe421c --- /dev/null +++ b/examples/search_people.ipynb @@ -0,0 +1,208 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "ExecuteTime": { + "end_time": "2019-10-02T08:17:29.325902Z", + "start_time": "2019-10-02T08:17:29.139310Z" + } + }, + "outputs": [], + "source": [ + "import sys\n", + "sys.path.append('../')\n", + "\n", + "from src.scraper import Authenticator, Scraper\n", + "\n", + "%load_ext autoreload\n", + "%autoreload 2" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:scraper:Loading parameters..\n", + "\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Enter username:example@gmail.com\n", + "Enter password:········\n" + ] + } + ], + "source": [ + "# Pass in list [username, password] for automated authentication\n", + "auth = Authenticator()" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:scraper:Submitting credentials and logging-in\n", + "\n" + ] + } + ], + "source": [ + "scraper = Scraper(authenticator=auth)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "scrolled": true + }, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:scraper:Passed following parameters: {'location': ['xpath', \"//input[contains(@placeholder, 'country')]\", ['London', 'Manchester']], 'industry': ['xpath', \"//input[contains(@placeholder, 'industry')]\", 'Hospitality'], 'job_title': ['xpath', \"//input[@id='search-advanced-title']\", 'Executive Chef']}\n", + "\n", + "INFO:scraper:Query returned 519 results\n", + "\n", + "INFO:scraper:Reached last searcheable page..\n", + "INFO:scraper:Retrieved profiles\n" + ] + } + ], + "source": [ + "people_search = scraper.search_people(job_title='Executive Chef',\n", + " location=['London', 'Manchester'],\n", + " industry='Hospitality')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
URLNameTitleLocation
0#LinkedIn Memberexec chef at ciborestaurantsManchester, United Kingdom
1https://www.linkedin.com/in/marcin-bialoskorsk...Marcin BialoskorskiExecutive Chef in SoLita RestaurantsManchester, United Kingdom
2https://www.linkedin.com/in/jean-philippe-rama...Jean-Philippe RamauExecutive Chef chez Gastronomica MELondon, United Kingdom
\n", + "
" + ], + "text/plain": [ + " URL Name \\\n", + "0 # LinkedIn Member \n", + "1 https://www.linkedin.com/in/marcin-bialoskorsk... Marcin Bialoskorski \n", + "2 https://www.linkedin.com/in/jean-philippe-rama... Jean-Philippe Ramau \n", + "\n", + " Title Location \n", + "0 exec chef at ciborestaurants Manchester, United Kingdom \n", + "1 Executive Chef in SoLita Restaurants Manchester, United Kingdom \n", + "2 Executive Chef chez Gastronomica ME London, United Kingdom " + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "people_search.head()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": ".venv", + "language": "python", + "name": ".venv" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.3" + }, + "toc": { + "base_numbering": 1, + "nav_menu": {}, + "number_sections": true, + "sideBar": true, + "skip_h1_title": false, + "title_cell": "Table of Contents", + "title_sidebar": "Contents", + "toc_cell": false, + "toc_position": {}, + "toc_section_display": true, + "toc_window_display": false + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..ca1337d --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +beautifulsoup4==4.8.0 +numpy==1.17.1 +pandas==0.25.1 +python-dateutil==2.8.0 +pytz==2019.2 +selenium==3.141.0 +six==1.12.0 +soupsieve==1.9.3 +urllib3==1.25.3 diff --git a/src/__init__.py b/src/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/config.py b/src/config.py new file mode 100644 index 0000000..b15a009 --- /dev/null +++ b/src/config.py @@ -0,0 +1,50 @@ +import os + +# Chromedriver executable +CHROMEDRIVER_FILEPATH = os.path.join(os.path.dirname(__file__), '../chromedriver') + +# Default URLS used for search methods +SEARCH_PEOPLE_DEFAULT_URL = 'https://www.linkedin.com/people/search/' +SEARCH_JOBS_DEFAULT_URL = 'https://www.linkedin.com/jobs/search/' + + +PARAMS = { + # Login page + 'authenticate': { + 'username_field': ['name', 'session_key'], + 'password_field': ['name', 'session_password'], + 'signin_button': ['xpath', "//button[contains(text(), 'Sign in')]"] + }, + + # Search page + 'search': { + 'search_field': ['xpath', "//input[contains(@class, 'search-global-typeahead')]"], + 'all_filters_button': ['xpath', "//button[@data-control-name='all_filters']"], + 'apply_button': ['xpath', "//button[@data-control-name='all_filters_apply']"], + + 'core_rail': ['css selector', '.core-rail'], + 'search_container': ['xpath', "//ul[contains(@class, 'search-result')]"], + 'url_container': ['xpath', "//div[contains(@class, 'info')]/a[@href]"], + 'name_container': ['xpath', "li//span[contains(@class, 'actor-name')]"], + 'job_location_container': ['xpath', "//p[contains(@class, 'subline')]/span[@dir]"], + + 'next_button': ['xpath', "//button[@aria-label='Next']"] + }, + + # People-specific search page + 'search.people': { + 'default_url': SEARCH_PEOPLE_DEFAULT_URL, + 'location': ['xpath', "//input[contains(@placeholder, 'country')]"], + 'industry': ['xpath', "//input[contains(@placeholder, 'industry')]"], + 'current_company': ['xpath', "//input[contains(@placeholder, 'company')]"], + + 'job_title': ['xpath', "//input[@id='search-advanced-title']"], # text + }, + # Job-specific search page + 'search.jobs': { + 
'default_url': SEARCH_JOBS_DEFAULT_URL + }, +} + + + diff --git a/src/scraper.py b/src/scraper.py new file mode 100644 index 0000000..3eac452 --- /dev/null +++ b/src/scraper.py @@ -0,0 +1,226 @@ +import logging +import time +import os +import numpy as np +import pandas as pd +from selenium import webdriver +from selenium.webdriver.common.keys import Keys +import getpass +from src import utils +from src.config import ( + CHROMEDRIVER_FILEPATH, + PARAMS, +) + +logger = logging.getLogger('scraper') +logging.basicConfig(level=logging.INFO) + + +def element_handler(func): + def wrapper(*args, **kwargs): + n_failures = 0 + element = None + while (n_failures <= 2) & (element is None): + try: + element = func(*args, **kwargs) + except Exception as e: + logger.info(f'Failed to grab element, error {e}') + time.sleep(2) + n_failures += 1 + return element + + return wrapper + + +class Loader(object): + def __init__(self, chromedriver_filepath=CHROMEDRIVER_FILEPATH, params=PARAMS): + logger.info('Loading parameters..\n') + self.authenticate_params = params['authenticate'] + self.search_params = params['search'] + self.chromedriver_filepath = chromedriver_filepath + assert os.path.isfile(self.chromedriver_filepath), 'Please add the chromedriver file to repository root' + + +class Authenticator(Loader): + def __init__(self, credentials=None): + super().__init__() + if not credentials: + self._username = input('Enter username:') + self._password = getpass.getpass('Enter password:') + else: + self._username = credentials[0] + self._password = credentials[1] + self.driver = None + + def login(self): + logger.info('Submitting credentials and logging-in\n') + self.driver = webdriver.Chrome(self.chromedriver_filepath) + self.driver.implicitly_wait(5) + self.driver.maximize_window() + self.driver.get('https://www.linkedin.com') + time.sleep(2) + + username_field = self.driver.find_element(self.authenticate_params['username_field'][0], + self.authenticate_params['username_field'][1]) + password_field = self.driver.find_element(self.authenticate_params['password_field'][0], + self.authenticate_params['password_field'][1]) + signin_button = self.driver.find_element(self.authenticate_params['signin_button'][0], + self.authenticate_params['signin_button'][1]) + + username_field.send_keys(self._username) + password_field.send_keys(self._password) + signin_button.click() + + return self.driver + + +class Scraper(object): + def __init__(self, authenticator): + assert isinstance(authenticator, Authenticator), 'Please instantiate an Authenticator object' + self.driver = authenticator.login() + self.results = [] + + @element_handler + def find_element(self, by, path): + return self.driver.find_element(by, path) + + @staticmethod + def _absolute_filter(element, value): + element.clear() + element.send_keys(value) + + def _fuzzy_filter(self, element, value): + actions = webdriver.ActionChains(self.driver) + actions.send_keys_to_element(element, value).pause(1.5).send_keys([Keys.DOWN, Keys.ENTER]) + actions.perform() + + def apply_filters(self, factory): + all_filters_button = self.find_element(factory.params['all_filters_button'][0], + factory.params['all_filters_button'][1]) + all_filters_button.click() + for param, config in factory.search_object.items(): + time.sleep(0.5) + element = self.find_element(config[0], config[1]) + if param in ['job_title']: + self._absolute_filter(element, config[2]) + else: + if isinstance(config[2], list): + for value in config[2]: + self._fuzzy_filter(element, value) + else: + 
self._fuzzy_filter(element, config[2])
+
+        apply_button = self.find_element(factory.params['apply_button'][0], factory.params['apply_button'][1])
+        apply_button.click()
+
+    @staticmethod
+    def _grab_urls(search_container, factory):
+        urls = []
+        url_container = search_container.find_elements(factory.params['url_container'][0],
+                                                        factory.params['url_container'][1])
+        for url in url_container:
+            href = url.get_property('href')
+            if '/in/' in href:
+                urls.append(href)
+            else:
+                # Private profiles expose no public URL; mark them with '#'
+                urls.append('#')
+        return urls
+
+    @staticmethod
+    def _grab_names(search_container, factory):
+        name_container = search_container.find_elements(factory.params['name_container'][0],
+                                                         factory.params['name_container'][1])
+        names = [name.text for name in name_container]
+        return names
+
+    @staticmethod
+    def _grab_job_location(search_container, factory):
+        jobs_location_container = search_container.find_elements(factory.params['job_location_container'][0],
+                                                                  factory.params['job_location_container'][1])
+        job_locations = [i.text for i in jobs_location_container]
+        # Results alternate between title and location, so reshape into (n_profiles, 2)
+        job_locations = np.reshape(job_locations, (len(jobs_location_container) // 2, 2))
+        return job_locations
+
+    def _parse_results(self, factory):
+        self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+        search_container = self.find_element(factory.params['search_container'][0],
+                                             factory.params['search_container'][1])
+        urls = self._grab_urls(search_container, factory)
+        names = self._grab_names(search_container, factory)
+        job_location = self._grab_job_location(search_container, factory)
+        current_df = pd.DataFrame(columns=['URL', 'Name', 'Title', 'Location'])
+        current_df['URL'] = urls
+        current_df['Name'] = names
+        current_df[['Title', 'Location']] = job_location
+        return current_df
+
+    def grab_results(self, factory):
+        time.sleep(1)
+        df = pd.DataFrame(columns=['URL', 'Name', 'Title', 'Location'])
+
+        core_rail = self.find_element(factory.params['core_rail'][0],
+                                      factory.params['core_rail'][1])
+
+        if core_rail.text.startswith('No results'):
+            logger.info(f'No results for search {factory.search_object}\n')
+        else:
+            n_profiles = core_rail.text.split('\n')[0].replace('Showing', 'Query returned')
+            logger.info(f"{n_profiles}\n")
+            df = pd.concat([df, self._parse_results(factory=factory)])
+            is_next_button = self.driver.find_elements(factory.params['next_button'][0],
+                                                       factory.params['next_button'][1])
+            if len(is_next_button) == 0:
+                logger.info('Reached last searchable page..')
+            else:
+                # Paginate until the 'Next' button is disabled; each page is parsed
+                # exactly once inside the loop (the last page needs no extra pass)
+                next_button = is_next_button[0]
+                while next_button.is_enabled():
+                    self.driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
+                    next_button.click()
+                    time.sleep(1.5)
+                    df = pd.concat([df, self._parse_results(factory=factory)])
+
+        return df
+
+    def _fetch_profiles(self, factory):
+        time.sleep(2)
+        self.apply_filters(factory)
+        df = self.grab_results(factory)
+        logger.info('Retrieved profiles')
+        return df
+
+    def search_people(self,
+                      search_keywords=None, location=None, industry=None, job_title=None, current_company=None,
+                      default_url=None):
+        """Returns a DataFrame of available profiles matching the criteria; private profiles are marked with '#'.
+
+        Parameters
+        ----------
+        search_keywords: str
+            Default search keyword to be used.
+        location: [str, list]
+            Locations to be considered (can be multiple).
+        industry: [str, list]
+            Industries to be considered (can be multiple).
+        job_title: str
+            Specific job title to search for.
+            For multiple titles, call this function once per title.
+        current_company: [str, list]
+            Current employer to be considered (can be multiple).
+        default_url: str
+            Default URL to use when applying filters and extracting results.
+
+        Returns
+        -------
+        DataFrame of retrieved profiles (columns=['URL', 'Name', 'Title', 'Location']).
+        """
+        assert isinstance(search_keywords, (str, type(None))) and isinstance(default_url, (str, type(None))), \
+            'Keywords and default URL must be strings'
+        factory = utils.Search(search_type='people', search_keywords=search_keywords, location=location,
+                               industry=industry, job_title=job_title, current_company=current_company,
+                               default_url=default_url)
+        logger.info(f'Passed following parameters: {factory.search_object}\n')
+        self.driver.get(factory.default_url)
+        df = self._fetch_profiles(factory)
+        return df
diff --git a/src/test_config.py b/src/test_config.py
new file mode 100644
index 0000000..149e162
--- /dev/null
+++ b/src/test_config.py
@@ -0,0 +1,16 @@
+import unittest
+
+
+class ElementValidity(unittest.TestCase):
+    def test_login_page(self):
+        pass
+
+    def test_search_page(self):
+        pass
+
+    def test_search_results(self):
+        pass
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/src/utils.py b/src/utils.py
new file mode 100644
index 0000000..1392af6
--- /dev/null
+++ b/src/utils.py
@@ -0,0 +1,25 @@
+from src.config import PARAMS
+
+
+class Search(object):
+
+    def __init__(self, search_type, **kwargs):
+        self._search_type = f'search.{search_type}'
+
+        # Merge the generic search selectors with the search-type specific ones
+        # without mutating the shared PARAMS dictionary.
+        self.params = dict(PARAMS['search'])
+        self.params.update(PARAMS[self._search_type])
+
+        # A user-supplied default_url takes precedence over the configured one.
+        self.default_url = kwargs.pop('default_url', None) or self.params['default_url']
+
+        self.selected_params = kwargs
+        self._generate_search_object()
+
+    def _generate_search_object(self):
+        # Map each requested filter onto its [by, selector, value] triple,
+        # skipping empty values and criteria with no configured page element.
+        self.search_object = {k: self.params[k] + [v]
+                              for k, v in self.selected_params.items()
+                              if v and k in self.params}
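
A possible starting point for the `test_config` item in the README's TODO list, fleshing out the `test_login_page` stub added above. This sketch is not part of the diff; it assumes network access, a local `chromedriver`, and that the selectors in `PARAMS['authenticate']` are reachable from the public LinkedIn landing page:

```
import unittest

from selenium import webdriver

from src.config import CHROMEDRIVER_FILEPATH, PARAMS


class ElementValidity(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # One shared driver for the whole test case keeps the run reasonably fast
        cls.driver = webdriver.Chrome(CHROMEDRIVER_FILEPATH)
        cls.driver.implicitly_wait(5)
        cls.driver.get('https://www.linkedin.com')

    @classmethod
    def tearDownClass(cls):
        cls.driver.quit()

    def test_login_page(self):
        # Every selector declared for the login page should match at least one element
        for name, (by, path) in PARAMS['authenticate'].items():
            with self.subTest(element=name):
                self.assertTrue(self.driver.find_elements(by, path),
                                f'No element found for {name} ({by}: {path})')


if __name__ == '__main__':
    unittest.main()
```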