Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

added init function #965

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion dev-documentation.md
Original file line number Diff line number Diff line change
Expand Up @@ -1629,7 +1629,7 @@ from scrape_up import ambitionBox

num_pages_to_scrape = 2

scraper = ambitionBox.Comapiens(num_pages_to_scrape)
scraper = ambitionBox.Companies(num_pages_to_scrape)

scraper.scrape_companies()

Expand Down
3 changes: 3 additions & 0 deletions src/scrape_up/ambitionBox/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
from .company import Companies

__all__ = ["Companies"]
85 changes: 55 additions & 30 deletions src/scrape_up/ambitionBox/company.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,54 @@
import requests
from bs4 import BeautifulSoup


class Comapiens:
def __init__(self,num_pages: int=1):
class Companies:
"""
A class to scrape company information and ratings from the AmbitionBox website.

Attributes:
num_pages (int): The number of pages to scrape.

Methods:
__init__(num_pages=1):
Initializes the Companies object with the specified number of pages to scrape.

print_sorted_list(company_list):
Prints a list of companies sorted by their ratings in descending order.

scrape_companies():
Scrapes company information and ratings from the specified number of pages on the AmbitionBox website.
Categorizes and prints the companies based on their ratings.
"""

def __init__(self, num_pages: int = 1):
"""
Initializes the Companies object with the specified number of pages to scrape.

Args:
num_pages (int): The number of pages to scrape. Defaults to 1.
"""
self.num_pages = num_pages

def write_sorted_list(self, file, company_list):
def print_sorted_list(self, company_list):
"""
Sorts the given list in place by rating (highest first) and prints each company's stripped name followed by its rating, one per line.

Args:
company_list (list): A list of tuples where each tuple contains the company name and its rating.
"""
company_list.sort(key=lambda x: x[1], reverse=True)
for company_name, rating in company_list:
file.write(f"{company_name.strip()} {rating}\n")
print(f"{company_name.strip()} {rating}")

def scrape_companies(self):


"""
Scrapes company information and ratings from the specified number of pages on the AmbitionBox website.
Categorizes and prints the companies based on their ratings.

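For each page, the companies are grouped into rating bands and printed in this order (a rating of 0 falls outside every band and is not printed):
- Companies with 5 stars (rating above 4, up to and including 5)
- Companies with 4 stars (rating above 3, up to and including 4)
- Companies with 3 stars (rating above 2, up to and including 3)
- Companies with 2 stars (rating above 1, up to and including 2)
- Companies with 1 star (rating above 0, up to and including 1)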
"""
headers = {
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36"
}
Expand All @@ -27,9 +61,7 @@ def scrape_companies(self):

if response.status_code == 200:
soup = BeautifulSoup(response.text, 'lxml')

companies = soup.find_all('div', class_="companyCardWrapper")

company_ratings = []

for company in companies:
Expand All @@ -43,27 +75,20 @@ def scrape_companies(self):
except ValueError:
print(f"Error parsing rating for company: {company_name}")

with open("src/scrape_up/ambitionBox/company_ratings.txt", "a") as f:
f.write(f"\nPAGE: {url}\n")
f.write("COMPANY UNDER 5 STAR\n")
self.write_sorted_list(f, [r for r in company_ratings if 4 < r[1] <= 5])
print(f"\nPAGE: {url}\n")
print("COMPANIES WITH 5 STARS\n")
self.print_sorted_list([r for r in company_ratings if 4 < r[1] <= 5])

f.write("\nCOMPANY UNDER 4 STAR\n")
self.write_sorted_list(f, [r for r in company_ratings if 3 < r[1] <= 4])
print("\nCOMPANIES WITH 4 STARS\n")
self.print_sorted_list([r for r in company_ratings if 3 < r[1] <= 4])

# Corrected indentation for following lines
f.write("\nCOMPANY UNDER 3 STAR\n")
self.write_sorted_list(f, [r for r in company_ratings if 2 < r[1] <= 3])
print("\nCOMPANIES WITH 3 STARS\n")
self.print_sorted_list([r for r in company_ratings if 2 < r[1] <= 3])

f.write("\nCOMPANY UNDER 2 STAR\n")
self.write_sorted_list(f, [r for r in company_ratings if 1 < r[1] <= 2])
print("\nCOMPANIES WITH 2 STARS\n")
self.print_sorted_list([r for r in company_ratings if 1 < r[1] <= 2])

f.write("\nCOMPANY UNDER 1 STAR\n")
self.write_sorted_list(f, [r for r in company_ratings if 0 < r[1] <= 1])
print("\nCOMPANIES WITH 1 STAR\n")
self.print_sorted_list([r for r in company_ratings if 0 < r[1] <= 1])
else:
print(f"Error scraping page {page}: {response.status_code}")


if __name__ == "__main__":
c = Comapiens(10)
c.scrape_companies()
Empty file.