diff --git a/dev-documentation.md b/dev-documentation.md index 7ff252de..0f10e99c 100644 --- a/dev-documentation.md +++ b/dev-documentation.md @@ -1629,7 +1629,7 @@ from scrape_up import ambitionBox num_pages_to_scrape = 2 -scraper = ambitionBox.Comapiens(num_pages_to_scrape) +scraper = ambitionBox.Companies(num_pages_to_scrape) scraper.scrape_companies() diff --git a/src/scrape_up/ambitionBox/__init__.py b/src/scrape_up/ambitionBox/__init__.py new file mode 100644 index 00000000..ca1826f0 --- /dev/null +++ b/src/scrape_up/ambitionBox/__init__.py @@ -0,0 +1,3 @@ +from .company import Companies + +__all__ = ["Companies"] diff --git a/src/scrape_up/ambitionBox/company.py b/src/scrape_up/ambitionBox/company.py index 49f30251..13411ba3 100644 --- a/src/scrape_up/ambitionBox/company.py +++ b/src/scrape_up/ambitionBox/company.py @@ -1,20 +1,54 @@ -import requests -from bs4 import BeautifulSoup - - -class Comapiens: - def __init__(self,num_pages: int=1): +class Companies: + """ + A class to scrape company information and ratings from the AmbitionBox website. + + Attributes: + num_pages (int): The number of pages to scrape. + + Methods: + __init__(num_pages=1): + Initializes the Companies object with the specified number of pages to scrape. + + print_sorted_list(company_list): + Prints a list of companies sorted by their ratings in descending order. + + scrape_companies(): + Scrapes company information and ratings from the specified number of pages on the AmbitionBox website. + Categorizes and prints the companies based on their ratings. + """ + + def __init__(self, num_pages: int = 1): + """ + Initializes the Companies object with the specified number of pages to scrape. + + Args: + num_pages (int): The number of pages to scrape. Defaults to 1. + """ self.num_pages = num_pages - def write_sorted_list(self, file, company_list): + def print_sorted_list(self, company_list): + """ + Prints a list of companies sorted by their ratings in descending order. + Args: + company_list (list): A list of tuples where each tuple contains the company name and its rating. + """ company_list.sort(key=lambda x: x[1], reverse=True) for company_name, rating in company_list: - file.write(f"{company_name.strip()} {rating}\n") + print(f"{company_name.strip()} {rating}") def scrape_companies(self): - - + """ + Scrapes company information and ratings from the specified number of pages on the AmbitionBox website. + Categorizes and prints the companies based on their ratings. + + The companies are categorized and printed as follows: + - Companies with 5 stars + - Companies with 4 stars + - Companies with 3 stars + - Companies with 2 stars + - Companies with 1 star + """ headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36" } @@ -27,9 +61,7 @@ def scrape_companies(self): if response.status_code == 200: soup = BeautifulSoup(response.text, 'lxml') - companies = soup.find_all('div', class_="companyCardWrapper") - company_ratings = [] for company in companies: @@ -43,27 +75,20 @@ def scrape_companies(self): except ValueError: print(f"Error parsing rating for company: {company_name}") - with open("src/scrape_up/ambitionBox/company_ratings.txt", "a") as f: - f.write(f"\nPAGE: {url}\n") - f.write("COMPANY UNDER 5 STAR\n") - self.write_sorted_list(f, [r for r in company_ratings if 4 < r[1] <= 5]) + print(f"\nPAGE: {url}\n") + print("COMPANIES WITH 5 STARS\n") + self.print_sorted_list([r for r in company_ratings if 4 < r[1] <= 5]) - f.write("\nCOMPANY UNDER 4 STAR\n") - self.write_sorted_list(f, [r for r in company_ratings if 3 < r[1] <= 4]) + print("\nCOMPANIES WITH 4 STARS\n") + self.print_sorted_list([r for r in company_ratings if 3 < r[1] <= 4]) - # Corrected indentation for following lines - f.write("\nCOMPANY UNDER 3 STAR\n") - self.write_sorted_list(f, [r for r in company_ratings if 2 < r[1] <= 3]) + print("\nCOMPANIES WITH 3 STARS\n") + self.print_sorted_list([r for r in company_ratings if 2 < r[1] <= 3]) - f.write("\nCOMPANY UNDER 2 STAR\n") - self.write_sorted_list(f, [r for r in company_ratings if 1 < r[1] <= 2]) + print("\nCOMPANIES WITH 2 STARS\n") + self.print_sorted_list([r for r in company_ratings if 1 < r[1] <= 2]) - f.write("\nCOMPANY UNDER 1 STAR\n") - self.write_sorted_list(f, [r for r in company_ratings if 0 < r[1] <= 1]) + print("\nCOMPANIES WITH 1 STAR\n") + self.print_sorted_list([r for r in company_ratings if 0 < r[1] <= 1]) else: print(f"Error scraping page {page}: {response.status_code}") - - -if __name__ == "__main__": - c = Comapiens(10) - c.scrape_companies() diff --git a/src/scrape_up/ambitionBox/company_ratings.txt b/src/scrape_up/ambitionBox/company_ratings.txt deleted file mode 100644 index e69de29b..00000000