From 30316f0f7b69dfc17443da32fe8b7fb835f10dc3 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Thu, 17 Nov 2022 19:05:17 +0200 Subject: [PATCH 1/9] Created using Colaboratory --- Webscraping2.ipynb | 759 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 759 insertions(+) create mode 100644 Webscraping2.ipynb diff --git a/Webscraping2.ipynb b/Webscraping2.ipynb new file mode 100644 index 0000000..e56a62e --- /dev/null +++ b/Webscraping2.ipynb @@ -0,0 +1,759 @@ +{ + "nbformat": 4, + "nbformat_minor": 0, + "metadata": { + "colab": { + "provenance": [], + "authorship_tag": "ABX9TyOFMmAMVvajwCa+6KvYpGo9", + "include_colab_link": true + }, + "kernelspec": { + "name": "python3", + "display_name": "Python 3" + }, + "language_info": { + "name": "python" + } + }, + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "code", + "execution_count": 62, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "L1KWrQ6slzA2", + "outputId": "3303657e-ca69-4e52-d309-a2fea730edc2" + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "
\n", + "
​​​​​​​Organizational Development and Accountability Advisor\n", + "
\n", + "

\n", + "Trócaire |\n", + " Published on 17-11-2022 |\n", + " Deadline \n", + "
\n", + " Senior (5+ years of experience)
\n", + "Job\n", + "

\n", + "
" + ] + }, + "metadata": {}, + "execution_count": 62 + } + ], + "source": [ + "import requests\n", + "from bs4 import BeautifulSoup\n", + "link = 'https://www.jobinrwanda.com'\n", + "content = requests.get(link).content\n", + "raw_content = BeautifulSoup(content, 'html.parser').find_all('div', class_='card-body p-2')\n", + "raw_content[0]" + ] + }, + { + "cell_type": "code", + "source": [ + "raw_content[0].find('a')['href']" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "yV0112WCqg8B", + "outputId": "8e4b1426-75aa-4756-f0ee-8b58edf5ba28" + }, + "execution_count": 63, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'/job/organizational-development-and-accountability-advisor'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 63 + } + ] + }, + { + "cell_type": "code", + "source": [ + "raw_content[0].find('span').getText() # Title of each job alert\n", + "raw_content[0].find('a')['href'] # Link to the content of each job title\n", + "raw_content[0].find('p').find('a')['href'] # Link to the institution that has posted the advert\n", + "raw_content[0].find('p').find('a').getText() # This is the name of the company\n", + "raw_content[0].find('p').find('span').getText() # Type of advert" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 36 + }, + "id": "_1IwLEoQq4h3", + "outputId": "12407498-c823-4573-9f45-8870a352e8d8" + }, + "execution_count": 64, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'Job'" + ], + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + } + }, + "metadata": {}, + "execution_count": 64 + } + ] + }, + { + "cell_type": "code", + "source": [ + "import pandas as pd\n", + "df = pd.DataFrame(columns=['link','title','company_link','company','job_type'])\n", + "\n", + "def 
parse_content(adverts):\n", + " link = []\n", + " title = []\n", + " company = []\n", + " company_link = []\n", + " job_type = []\n", + " for advert in adverts:\n", + " link.append('https://www.jobinrwanda.com' + advert.find('a')['href'])\n", + " title.append(advert.find('span').getText())\n", + " company.append(advert.find('p').find('a').getText())\n", + " company_link.append(advert.find('p').find('a')['href'])\n", + " job_type.append(advert.find('p').find('span').getText())\n", + " return link, title, company, company_link, job_type" + ], + "metadata": { + "id": "fHrMvcfUun8Q" + }, + "execution_count": 65, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "link = 'https://www.jobinrwanda.com'\n", + "content = requests.get(link).content\n", + "raw_content = BeautifulSoup(content, 'html.parser').find_all('div', class_='card-body p-2')\n", + "df['link'], df['title'], df['company'], df['company_link'], df['job_type'] = parse_content(raw_content)\n", + "df" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 424 + }, + "id": "5vePYScS2ctx", + "outputId": "a0cc239f-599c-40d0-ce39-6ccc020d1480" + }, + "execution_count": 66, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " link \\\n", + "0 https://www.jobinrwanda.com/job/organizational... \n", + "1 https://www.jobinrwanda.com/job/monitoring-inc... \n", + "2 https://www.jobinrwanda.com/job/resource-mobil... \n", + "3 https://www.jobinrwanda.com/job/request-propos... \n", + "4 https://www.jobinrwanda.com/job/finance-manage... \n", + ".. ... \n", + "187 https://www.jobinrwanda.com/job/rwanda-tree-lead \n", + "188 https://www.jobinrwanda.com/index.php/job/rwan... \n", + "189 https://www.jobinrwanda.com/job/it-operations-... \n", + "190 https://www.jobinrwanda.com/job/rwanda-seed-in... \n", + "191 https://www.jobinrwanda.com/job/senior-busines... \n", + "\n", + " title \\\n", + "0 ​​​​​​​Organizational Development and Accounta... 
\n", + "1 Monitoring, Inclusion and Learning Advisor \n", + "2 Resource Mobilization and Grant Manager \n", + "3 Request for Proposals for Electronic Logistics... \n", + "4 Finance Manager \n", + ".. ... \n", + "187 Rwanda Tree Lead \n", + "188 Rwanda Potato Seed Venture Lead \n", + "189 IT Operations Senior Manager \n", + "190 Rwanda Seed Innovation Centre Lead \n", + "191 Senior Business Analyst \n", + "\n", + " company_link company \\\n", + "0 /employer/trocaire Trócaire \n", + "1 /employer/trocaire Trócaire \n", + "2 /employer/trocaire Trócaire \n", + "3 /employer/chemonics-international-inc Chemonics International Inc. \n", + "4 /employer/womens-bakery-1 The Women's Bakery \n", + ".. ... ... \n", + "187 /employer/one-acre-fund One Acre Fund \n", + "188 /index.php/employer/one-acre-fund One Acre Fund \n", + "189 /employer/one-acre-fund One Acre Fund \n", + "190 /employer/one-acre-fund One Acre Fund \n", + "191 /employer/one-acre-fund One Acre Fund \n", + "\n", + " job_type \n", + "0 Job \n", + "1 Job \n", + "2 Job \n", + "3 Tender \n", + "4 Job \n", + ".. ... \n", + "187 Job \n", + "188 Job \n", + "189 Job \n", + "190 Job \n", + "191 Job \n", + "\n", + "[192 rows x 5 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linktitlecompany_linkcompanyjob_type
0https://www.jobinrwanda.com/job/organizational...​​​​​​​Organizational Development and Accounta.../employer/trocaireTrócaireJob
1https://www.jobinrwanda.com/job/monitoring-inc...Monitoring, Inclusion and Learning Advisor/employer/trocaireTrócaireJob
2https://www.jobinrwanda.com/job/resource-mobil...Resource Mobilization and Grant Manager/employer/trocaireTrócaireJob
3https://www.jobinrwanda.com/job/request-propos...Request for Proposals for Electronic Logistics.../employer/chemonics-international-incChemonics International Inc.Tender
4https://www.jobinrwanda.com/job/finance-manage...Finance Manager/employer/womens-bakery-1The Women's BakeryJob
..................
187https://www.jobinrwanda.com/job/rwanda-tree-leadRwanda Tree Lead/employer/one-acre-fundOne Acre FundJob
188https://www.jobinrwanda.com/index.php/job/rwan...Rwanda Potato Seed Venture Lead/index.php/employer/one-acre-fundOne Acre FundJob
189https://www.jobinrwanda.com/job/it-operations-...IT Operations Senior Manager/employer/one-acre-fundOne Acre FundJob
190https://www.jobinrwanda.com/job/rwanda-seed-in...Rwanda Seed Innovation Centre Lead/employer/one-acre-fundOne Acre FundJob
191https://www.jobinrwanda.com/job/senior-busines...Senior Business Analyst/employer/one-acre-fundOne Acre FundJob
\n", + "

192 rows × 5 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 66 + } + ] + }, + { + "cell_type": "code", + "source": [ + "def get_description(job_link):\n", + " job_description_raw_content = requests.get(job_link).content # request raw content\n", + " job_description_bs = BeautifulSoup(job_description_raw_content, 'html.parser')\n", + " employer_description = job_description_bs.find_all('div', class_='employer-description')[0].getText()\n", + "\n", + " describe = job_description_bs.find_all('div', class_='clearfix text-formatted field field--name-field-job-full-description field--type-text-long field--label-hidden field__item')\n", + " return employer_description\n" + ], + "metadata": { + "id": "YDXK8fZX7ohf" + }, + "execution_count": 67, + "outputs": [] + }, + { + "cell_type": "code", + "source": [ + "# Adding description\n", + "descr = []\n", + "for index in range(len(df)):\n", + " try:\n", + " descr.append(get_description(df['link'][index]))\n", + " except IndexError:\n", + " descr.append('NA')\n", + "df['description'] = descr\n", + "df\n" + ], + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 641 + }, + "id": "6GIEX9he4kIS", + "outputId": "17bf7376-90d2-428a-f868-e1f253ffe302" + }, + "execution_count": 68, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + " link \\\n", + "0 https://www.jobinrwanda.com/job/organizational... \n", + "1 https://www.jobinrwanda.com/job/monitoring-inc... \n", + "2 https://www.jobinrwanda.com/job/resource-mobil... \n", + "3 https://www.jobinrwanda.com/job/request-propos... \n", + "4 https://www.jobinrwanda.com/job/finance-manage... \n", + ".. ... \n", + "187 https://www.jobinrwanda.com/job/rwanda-tree-lead \n", + "188 https://www.jobinrwanda.com/index.php/job/rwan... \n", + "189 https://www.jobinrwanda.com/job/it-operations-... \n", + "190 https://www.jobinrwanda.com/job/rwanda-seed-in... \n", + "191 https://www.jobinrwanda.com/job/senior-busines... 
\n", + "\n", + " title \\\n", + "0 ​​​​​​​Organizational Development and Accounta... \n", + "1 Monitoring, Inclusion and Learning Advisor \n", + "2 Resource Mobilization and Grant Manager \n", + "3 Request for Proposals for Electronic Logistics... \n", + "4 Finance Manager \n", + ".. ... \n", + "187 Rwanda Tree Lead \n", + "188 Rwanda Potato Seed Venture Lead \n", + "189 IT Operations Senior Manager \n", + "190 Rwanda Seed Innovation Centre Lead \n", + "191 Senior Business Analyst \n", + "\n", + " company_link company \\\n", + "0 /employer/trocaire Trócaire \n", + "1 /employer/trocaire Trócaire \n", + "2 /employer/trocaire Trócaire \n", + "3 /employer/chemonics-international-inc Chemonics International Inc. \n", + "4 /employer/womens-bakery-1 The Women's Bakery \n", + ".. ... ... \n", + "187 /employer/one-acre-fund One Acre Fund \n", + "188 /index.php/employer/one-acre-fund One Acre Fund \n", + "189 /employer/one-acre-fund One Acre Fund \n", + "190 /employer/one-acre-fund One Acre Fund \n", + "191 /employer/one-acre-fund One Acre Fund \n", + "\n", + " job_type description \n", + "0 Job \\nTrócaire is the official overseas developmen... \n", + "1 Job \\nTrócaire is the official overseas developmen... \n", + "2 Job \\nTrócaire is the official overseas developmen... \n", + "3 Tender NA \n", + "4 Job NA \n", + ".. ... ... \n", + "187 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "188 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "189 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "190 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "191 Job \\nFounded in 2006, One Acre Fund supplies 1 mi... \n", + "\n", + "[192 rows x 6 columns]" + ], + "text/html": [ + "\n", + "
\n", + "
\n", + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
linktitlecompany_linkcompanyjob_typedescription
0https://www.jobinrwanda.com/job/organizational...​​​​​​​Organizational Development and Accounta.../employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
1https://www.jobinrwanda.com/job/monitoring-inc...Monitoring, Inclusion and Learning Advisor/employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
2https://www.jobinrwanda.com/job/resource-mobil...Resource Mobilization and Grant Manager/employer/trocaireTrócaireJob\\nTrócaire is the official overseas developmen...
3https://www.jobinrwanda.com/job/request-propos...Request for Proposals for Electronic Logistics.../employer/chemonics-international-incChemonics International Inc.TenderNA
4https://www.jobinrwanda.com/job/finance-manage...Finance Manager/employer/womens-bakery-1The Women's BakeryJobNA
.....................
187https://www.jobinrwanda.com/job/rwanda-tree-leadRwanda Tree Lead/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
188https://www.jobinrwanda.com/index.php/job/rwan...Rwanda Potato Seed Venture Lead/index.php/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
189https://www.jobinrwanda.com/job/it-operations-...IT Operations Senior Manager/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
190https://www.jobinrwanda.com/job/rwanda-seed-in...Rwanda Seed Innovation Centre Lead/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
191https://www.jobinrwanda.com/job/senior-busines...Senior Business Analyst/employer/one-acre-fundOne Acre FundJob\\nFounded in 2006, One Acre Fund supplies 1 mi...
\n", + "

192 rows × 6 columns

\n", + "
\n", + " \n", + " \n", + " \n", + "\n", + " \n", + "
\n", + "
\n", + " " + ] + }, + "metadata": {}, + "execution_count": 68 + } + ] + } + ] +} \ No newline at end of file From 658af1fa1b12c030f64a58a3fcdaca275be9a14f Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:17:09 +0200 Subject: [PATCH 2/9] Update README.md --- README.md | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9de005d..7608c3c 100644 --- a/README.md +++ b/README.md @@ -1 +1,25 @@ -# NLP-FELLOWSHIP +# Web scraping + +## Description +The objective of this project is to scrap the job opportunities posted on [here] (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. + + +## Code +Below is the code that is used to extract the required information from the jobs posts: + +* Title of each job +`content.find('span').getText()` +* Link to the content of each job title +`content.find('a')['href']` +* Link to the institution that has posted the advert +`content.find('p').find('a')['href']` +* Name of the company +`content.find('p').find('a').getText()` +* Type of advert +`content.find('p').find('span').getText()` +* Company description +`content.find_all('div', class_='employer-description')[0].getText()` + +## Output +Below is the dataframe that was created from the scrapped data +![Company](https://user-images.githubusercontent.com/60528574/203159813-a73e32a1-7e57-476d-8e9e-6c9735df4df1.PNG) From abc8e7c31ec6a79a1bddbd04206e00cd840c35ef Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:18:01 +0200 Subject: [PATCH 3/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 7608c3c..242f85f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 
@@ # Web scraping ## Description -The objective of this project is to scrap the job opportunities posted on [here] (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. +The objective of this project is to scrap the job opportunities posted on ([here] (https://www.jobinrwanda.com)) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. ## Code From cf8969160e6e45e4d900dee2e8ef898a03f3227c Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:19:36 +0200 Subject: [PATCH 4/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 242f85f..b64d5d8 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Web scraping ## Description -The objective of this project is to scrap the job opportunities posted on ([here] (https://www.jobinrwanda.com)) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. +The objective of this project is to scrap the job opportunities posted [here] (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. 
## Code From 3486e39e9aef97dfa748557ae1ad6a67812fc255 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:20:26 +0200 Subject: [PATCH 5/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index b64d5d8..e372c3f 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Web scraping ## Description -The objective of this project is to scrap the job opportunities posted [here] (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. +The objective of this project is to scrap the job opportunities posted (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. ## Code From 6977eafe5ee5bc789bac12174e446e3f2733aca4 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:21:56 +0200 Subject: [PATCH 6/9] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e372c3f..c2cfe73 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # Web scraping ## Description -The objective of this project is to scrap the job opportunities posted (https://www.jobinrwanda.com) using the requests library to interact with the API. A library called BeautifulSoup to extract the meaningful content from the html which is obtained when the request library interacts with the API. +The objective of this project is to scrape the job opportunities posted [here](https://www.jobinrwanda.com) using the requests library to interact with the API. 
A library called BeautifulSoup is used to extract the meaningful content from the HTML which is obtained when the requests library interacts with the API. ## Code From 9fc7bfbe9d0eaa10749083d34fde27f8033056a1 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:31:41 +0200 Subject: [PATCH 7/9] Update README.md --- README.md | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/README.md b/README.md index c2cfe73..4e98090 100644 --- a/README.md +++ b/README.md @@ -23,3 +23,18 @@ Below is the code that is used to extract the required information from the jobs ## Output Below is the dataframe that was created from the scrapped data ![Company](https://user-images.githubusercontent.com/60528574/203159813-a73e32a1-7e57-476d-8e9e-6c9735df4df1.PNG) + +# Text to features +The other objective of the project was to convert a list of sentences to features. Text to features is the process of converting tokens to numbers. This is because the machine only works with numbers. Moreover, for manipulation of text, the tokens need to be in digit form to apply any transformations. + +The input of the function will be the matrix of tokens and output will be matrix with digits. + +## Simplest form of featurization +The simplest way is to assign each unique text a number starting from 0 and increase by one until all the text has been assigned numbers + +## Bag Of Words (BoW) +* Split the sentences into words +* Create a dictionary with all unique words and their indices +* Create a vector, size same as the total number of unique words +* For every word in a sentence, get the index and add 1. +* The result will be a vector for each sentence with length same as all the unique words in all sentences, with frequency of each word in one particular sentence. 
If a word is not in that sentence, the frequency is 0 From ec125a036d4096b7e66b76ee8ed294fa5754b477 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Mon, 21 Nov 2022 23:34:24 +0200 Subject: [PATCH 8/9] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 4e98090..b76892a 100644 --- a/README.md +++ b/README.md @@ -29,6 +29,8 @@ The other objective of the project was to convert a list of sentences to feature The input of the function will be the matrix of tokens and output will be matrix with digits. +Two ways of converting text to features were used in the project which are the simplest form of featurization and bag of words (BOW) + ## Simplest form of featurization The simplest way is to assign each unique text a number starting from 0 and increase by one until all the text has been assigned numbers From 86b4689202589b5268e516f0caf41fe653d20278 Mon Sep 17 00:00:00 2001 From: imukoki <60528574+imukoki@users.noreply.github.com> Date: Tue, 22 Nov 2022 00:01:12 +0200 Subject: [PATCH 9/9] Update README.md --- README.md | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/README.md b/README.md index b76892a..ad26bad 100644 --- a/README.md +++ b/README.md @@ -24,19 +24,4 @@ Below is the code that is used to extract the required information from the jobs Below is the dataframe that was created from the scrapped data ![Company](https://user-images.githubusercontent.com/60528574/203159813-a73e32a1-7e57-476d-8e9e-6c9735df4df1.PNG) -# Text to features -The other objective of the project was to convert a list of sentences to features. Text to features is the process of converting tokens to numbers. This is because the machine only works with numbers. Moreover, for manipulation of text, the tokens need to be in digit form to apply any transformations. -The input of the function will be the matrix of tokens and output will be matrix with digits. 
- -Two ways of converting text to features were used in the project which are the simplest form of featurization and bag of words (BOW) - -## Simplest form of featurization -The simplest way is to assign each unique text a number starting from 0 and increase by one until all the text has been assigned numbers - -## Bag Of Words (BoW) -* Split the sentences into words -* Create a dictionary with all unique words and their indices -* Create a vector, size same as the total number of unique words -* For every word in a sentence, get the index and add 1. -* The result will be a vector for each sentence with length same as all the unique words in all sentences, with frequency of each word in one particular sentence. If a word is not in that sentence, the frequency is 0