diff --git a/README.md b/README.md index 9de005d..ad26bad 100644 --- a/README.md +++ b/README.md @@ -1 +1,27 @@ -# NLP-FELLOWSHIP +# Web scraping + +## Description +The objective of this project is to scrape the job opportunities posted [here](https://www.jobinrwanda.com), using the requests library to fetch the site's pages. The BeautifulSoup library is then used to extract the meaningful content from the HTML returned by those requests. + + +## Code +Below is the code that is used to extract the required information from the job posts: + +* Title of each job +`content.find('span').getText()` +* Link to the content of each job title +`content.find('a')['href']` +* Link to the institution that has posted the advert +`content.find('p').find('a')['href']` +* Name of the company +`content.find('p').find('a').getText()` +* Type of advert +`content.find('p').find('span').getText()` +* Company description +`content.find_all('div', class_='employer-description')[0].getText()` + +## Output +Below is the dataframe that was created from the scraped data +![Company](https://user-images.githubusercontent.com/60528574/203159813-a73e32a1-7e57-476d-8e9e-6c9735df4df1.PNG) + + diff --git a/Webscraping2.ipynb b/Webscraping2.ipynb new file mode 100644 index 0000000..e56a62e --- /dev/null +++ b/Webscraping2.ipynb @@ -0,0 +1,759 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyOFMmAMVvajwCa+6KvYpGo9", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L1KWrQ6slzA2", + "outputId": 
"3303657e-ca69-4e52-d309-a2fea730edc2" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "
\n", + "
​​​​​​​Organizational Development and Accountability Advisor\n", + "
\n", + "

\n", + "Trócaire |\n", + " Published on 17-11-2022 |\n", + " Deadline \n", + "
\n", + " Senior (5+ years of experience)
\n", + "Job\n", + "

\n", + "
" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "link = 'https://www.jobinrwanda.com'\n", + "content = requests.get(link).content\n", + "raw_content = BeautifulSoup(content, 'html.parser').find_all('div', class_='card-body p-2')\n", + "raw_content[0]" + ] + }, + { + "cell_type": "code", + "source": [ + "raw_content[0].find('a')['href']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "yV0112WCqg8B", + "outputId": "8e4b1426-75aa-4756-f0ee-8b58edf5ba28" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/job/organizational-development-and-accountability-advisor'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 63 + } + ] + }, + { + "cell_type": "code", + "source": [ + "raw_content[0].find('span').getText() # Title of each job alert\n", + "raw_content[0].find('a')['href'] # Link to the content of each job title\n", + "raw_content[0].find('p').find('a')['href'] # Link to the institution that has posted the advert\n", + "raw_content[0].find('p').find('a').getText() # This is the name of the company\n", + "raw_content[0].find('p').find('span').getText() # Type of advert" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "_1IwLEoQq4h3", + "outputId": "12407498-c823-4573-9f45-8870a352e8d8" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'Job'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 64 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "df = pd.DataFrame(columns=['link','title','company_link','company','job_type'])\n", + "\n", + "def 
parse_content(adverts):\n", + " link = []\n", + " title = []\n", + " company = []\n", + " company_link = []\n", + " job_type = []\n", + " for advert in adverts:\n", + " link.append('https://www.jobinrwanda.com' + advert.find('a')['href'])\n", + " title.append(advert.find('span').getText())\n", + " company.append(advert.find('p').find('a').getText())\n", + " company_link.append(advert.find('p').find('a')['href'])\n", + " job_type.append(advert.find('p').find('span').getText())\n", + " return link, title, company, company_link, job_type" + ], + "metadata": { + "id": "fHrMvcfUun8Q" + }, + "execution_count": 65, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "link = 'https://www.jobinrwanda.com'\n", + "content = requests.get(link).content\n", + "raw_content = BeautifulSoup(content, 'html.parser').find_all('div', class_='card-body p-2')\n", + "df['link'], df['title'], df['company'], df['company_link'], df['job_type'] = parse_content(raw_content)\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "5vePYScS2ctx", + "outputId": "a0cc239f-599c-40d0-ce39-6ccc020d1480" + }, + "execution_count": 66, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " link \\\n", + "0 https://www.jobinrwanda.com/job/organizational... \n", + "1 https://www.jobinrwanda.com/job/monitoring-inc... \n", + "2 https://www.jobinrwanda.com/job/resource-mobil... \n", + "3 https://www.jobinrwanda.com/job/request-propos... \n", + "4 https://www.jobinrwanda.com/job/finance-manage... \n", + ".. ... \n", + "187 https://www.jobinrwanda.com/job/rwanda-tree-lead \n", + "188 https://www.jobinrwanda.com/index.php/job/rwan... \n", + "189 https://www.jobinrwanda.com/job/it-operations-... \n", + "190 https://www.jobinrwanda.com/job/rwanda-seed-in... \n", + "191 https://www.jobinrwanda.com/job/senior-busines... \n", + "\n", + " title \\\n", + "0 ​​​​​​​Organizational Development and Accounta... 
\n", + "1 Monitoring, Inclusion and Learning Advisor \n", + "2 Resource Mobilization and Grant Manager \n", + "3 Request for Proposals for Electronic Logistics... \n", + "4 Finance Manager \n", + ".. ... \n", + "187 Rwanda Tree Lead \n", + "188 Rwanda Potato Seed Venture Lead \n", + "189 IT Operations Senior Manager \n", + "190 Rwanda Seed Innovation Centre Lead \n", + "191 Senior Business Analyst \n", + "\n", + " company_link company \\\n", + "0 /employer/trocaire Trócaire \n", + "1 /employer/trocaire Trócaire \n", + "2 /employer/trocaire Trócaire \n", + "3 /employer/chemonics-international-inc Chemonics International Inc. \n", + "4 /employer/womens-bakery-1 The Women's Bakery \n", + ".. ... ... \n", + "187 /employer/one-acre-fund One Acre Fund \n", + "188 /index.php/employer/one-acre-fund One Acre Fund \n", + "189 /employer/one-acre-fund One Acre Fund \n", + "190 /employer/one-acre-fund One Acre Fund \n", + "191 /employer/one-acre-fund One Acre Fund \n", + "\n", + " job_type \n", + "0 Job \n", + "1 Job \n", + "2 Job \n", + "3 Tender \n", + "4 Job \n", + ".. ... \n", + "187 Job \n", + "188 Job \n", + "189 Job \n", + "190 Job \n", + "191 Job \n", + "\n", + "[192 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linktitlecompany_linkcompanyjob_type
0https://www.jobinrwanda.com/job/organizational...​​​​​​​Organizational Development and Accounta.../employer/trocaireTrócaireJob
1https://www.jobinrwanda.com/job/monitoring-inc...Monitoring, Inclusion and Learning Advisor/employer/trocaireTrócaireJob
2https://www.jobinrwanda.com/job/resource-mobil...Resource Mobilization and Grant Manager/employer/trocaireTrócaireJob
3https://www.jobinrwanda.com/job/request-propos...Request for Proposals for Electronic Logistics.../employer/chemonics-international-incChemonics International Inc.Tender
4https://www.jobinrwanda.com/job/finance-manage...Finance Manager/employer/womens-bakery-1The Women's BakeryJob
..................
187https://www.jobinrwanda.com/job/rwanda-tree-leadRwanda Tree Lead/employer/one-acre-fundOne Acre FundJob
188https://www.jobinrwanda.com/index.php/job/rwan...Rwanda Potato Seed Venture Lead/index.php/employer/one-acre-fundOne Acre FundJob
189https://www.jobinrwanda.com/job/it-operations-...IT Operations Senior Manager/employer/one-acre-fundOne Acre FundJob
190https://www.jobinrwanda.com/job/rwanda-seed-in...Rwanda Seed Innovation Centre Lead/employer/one-acre-fundOne Acre FundJob
191https://www.jobinrwanda.com/job/senior-busines...Senior Business Analyst/employer/one-acre-fundOne Acre FundJob
\n", + "

192 rows × 5 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 66 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def get_description(job_link):\n", + " job_description_raw_content = requests.get(job_link).content # request raw content\n", + " job_description_bs = BeautifulSoup(job_description_raw_content, 'html.parser')\n", + " employer_description = job_description_bs.find_all('div', class_='employer-description')[0].getText()\n", + "\n", + " describe = job_description_bs.find_all('div', class_='clearfix text-formatted field field--name-field-job-full-description field--type-text-long field--label-hidden field__item')\n", + " return employer_description\n" + ], + "metadata": { + "id": "YDXK8fZX7ohf" + }, + "execution_count": 67, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Adding description\n", + "descr = []\n", + "for index in range(len(df)):\n", + " try:\n", + " descr.append(get_description(df['link'][index]))\n", + " except IndexError:\n", + " descr.append('NA')\n", + "df['description'] = descr\n", + "df\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 641 + }, + "id": "6GIEX9he4kIS", + "outputId": "17bf7376-90d2-428a-f868-e1f253ffe302" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " link \\\n", + "0 https://www.jobinrwanda.com/job/organizational... \n", + "1 https://www.jobinrwanda.com/job/monitoring-inc... \n", + "2 https://www.jobinrwanda.com/job/resource-mobil... \n", + "3 https://www.jobinrwanda.com/job/request-propos... \n", + "4 https://www.jobinrwanda.com/job/finance-manage... \n", + ".. ... \n", + "187 https://www.jobinrwanda.com/job/rwanda-tree-lead \n", + "188 https://www.jobinrwanda.com/index.php/job/rwan... \n", + "189 https://www.jobinrwanda.com/job/it-operations-... \n", + "190 https://www.jobinrwanda.com/job/rwanda-seed-in... \n", + "191 https://www.jobinrwanda.com/job/senior-busines... 
\n", + "\n", + " title \\\n", + "0 ​​​​​​​Organizational Development and Accounta... \n", + "1 Monitoring, Inclusion and Learning Advisor \n", + "2 Resource Mobilization and Grant Manager \n", + "3 Request for Proposals for Electronic Logistics... \n", + "4 Finance Manager \n", + ".. ... \n", + "187 Rwanda Tree Lead \n", + "188 Rwanda Potato Seed Venture Lead \n", + "189 IT Operations Senior Manager \n", + "190 Rwanda Seed Innovation Centre Lead \n", + "191 Senior Business Analyst \n", + "\n", + " company_link company \\\n", + "0 /employer/trocaire Trócaire \n", + "1 /employer/trocaire Trócaire \n", + "2 /employer/trocaire Trócaire \n", + "3 /employer/chemonics-international-inc Chemonics International Inc. \n", + "4 /employer/womens-bakery-1 The Women's Bakery \n", + ".. ... ... \n", + "187 /employer/one-acre-fund One Acre Fund \n", + "188 /index.php/employer/one-acre-fund One Acre Fund \n", + "189 /employer/one-acre-fund One Acre Fund \n", + "190 /employer/one-acre-fund One Acre Fund \n", + "191 /employer/one-acre-fund One Acre Fund \n", + "\n", + " job_type description \n", + "0 Job \\nTrócaire is the official overseas developmen... \n", + "1 Job \\nTrócaire is the official overseas developmen... \n", + "2 Job \\nTrócaire is the official overseas developmen... \n", + "3 Tender NA \n", + "4 Job NA \n", + ".. ... ... \n", + "187 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "188 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "189 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "190 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "191 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "\n", + "[192 rows x 6 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linktitlecompany_linkcompanyjob_typedescription
0https://www.jobinrwanda.com/job/organizational...​​​​​​​Organizational Development and Accounta.../employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
1https://www.jobinrwanda.com/job/monitoring-inc...Monitoring, Inclusion and Learning Advisor/employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
2https://www.jobinrwanda.com/job/resource-mobil...Resource Mobilization and Grant Manager/employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
3https://www.jobinrwanda.com/job/request-propos...Request for Proposals for Electronic Logistics.../employer/chemonics-international-incChemonics International Inc.TenderNA
4https://www.jobinrwanda.com/job/finance-manage...Finance Manager/employer/womens-bakery-1The Women's BakeryJobNA
.....................
187https://www.jobinrwanda.com/job/rwanda-tree-leadRwanda Tree Lead/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
188https://www.jobinrwanda.com/index.php/job/rwan...Rwanda Potato Seed Venture Lead/index.php/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
189https://www.jobinrwanda.com/job/it-operations-...IT Operations Senior Manager/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
190https://www.jobinrwanda.com/job/rwanda-seed-in...Rwanda Seed Innovation Centre Lead/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
191https://www.jobinrwanda.com/job/senior-busines...Senior Business Analyst/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
\n", + "

192 rows × 6 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 68 + } + ] + } + ] +} \ No newline at end of file