Skip to content

Commit

Permalink
v3.3.0
Browse files Browse the repository at this point in the history
- Added the --workers option to set the number of thread pool workers for the Thread Pool Scraper
- Use Firefox instead of Chrome
- Adjusted wait time for clicking pop-up ad while scrolling down to 1 second
- Adjusted wait time for clicking load more result button to 1 second
- Adjusted wait time for clicking pop-up ad to 2 seconds
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added a check for the case where a past year is entered for the Thread Pool and Month End scrapers
- Added a timezone parameter to the Thread Pool and Month End scrapers so that they check whether a date has passed based on the given timezone, mainly to fix timezone problems when testing with GitHub Actions
  • Loading branch information
sakan811 committed Jun 13, 2024
1 parent 4b64a85 commit 825e526
Show file tree
Hide file tree
Showing 3 changed files with 22 additions and 12 deletions.
13 changes: 9 additions & 4 deletions japan_avg_hotel_price_finder/scrape_until_month_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@
from japan_avg_hotel_price_finder.utils import check_if_current_date_has_passed
from set_details import Details


logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')


Expand All @@ -39,11 +38,13 @@ def __init__(self, details: Details):
self.year = details.year
self.nights = details.nights

def scrape_until_month_end(self, to_sqlite: bool = False) -> None | pd.DataFrame:
def scrape_until_month_end(self, to_sqlite: bool = False, timezone=None) -> None | pd.DataFrame:
"""
Scrape hotel data (hotel name, room price, review score)
starting from a given start day until the end of the same month.
:param to_sqlite: If True, save the scraped data to a SQLite database, else save to CSV.
:param timezone: Set timezone.
Default is None.
:return: None or a Pandas DataFrame.
"""
logger.info(f'Scraping data from {self.start_day}-{calendar.month_name[self.month]}-{self.year} '
Expand All @@ -60,7 +61,10 @@ def scrape_until_month_end(self, to_sqlite: bool = False) -> None | pd.DataFrame

df_list = []

today = datetime.today()
if timezone is not None:
today = datetime.now(timezone)
else:
today = datetime.today()

if self.year < today.year:
logger.warning(f'The current year to scrape has passed. Skip {self.year}.')
Expand Down Expand Up @@ -89,7 +93,8 @@ def scrape_until_month_end(self, to_sqlite: bool = False) -> None | pd.DataFrame
else:
check_in = current_date.strftime('%Y-%m-%d')
check_out = (current_date + timedelta(days=self.nights)).strftime('%Y-%m-%d')
logger.info(f'Scrape data for {self.nights} nights. Check-in: {check_in}, Check-out: {check_out}')
logger.info(
f'Scrape data for {self.nights} nights. Check-in: {check_in}, Check-out: {check_out}')

current_date += timedelta(days=1)
logger.debug(f'Current date: {current_date.strftime("%Y-%m-%d")}')
Expand Down
9 changes: 7 additions & 2 deletions japan_avg_hotel_price_finder/thread_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -34,10 +34,12 @@ def __init__(self, details: Details):
"""
super().__init__(details)

def thread_scrape(self, to_sqlite: bool = False, max_workers: int = 9) -> None | pd.DataFrame:
def thread_scrape(self, to_sqlite: bool = False, timezone=None, max_workers: int = 9) -> None | pd.DataFrame:
"""
Scrape hotel data from the start day to the end of the same month using Thread Pool executor.
:param to_sqlite: If True, save the scraped data to a SQLite database, else save it to CSV.
:param timezone: Set timezone.
Default is None.
:param max_workers: Maximum number of threads to use.
Default is 9.
:return: None or a Pandas dataframe.
Expand All @@ -50,7 +52,10 @@ def thread_scrape(self, to_sqlite: bool = False, max_workers: int = 9) -> None |
# Define a list to store the result DataFrame from each thread
results = []

today = datetime.today()
if timezone is not None:
today = datetime.now(timezone)
else:
today = datetime.today()

if self.year < today.year:
logger.warning(f'The current year to scrape has passed. Skip {self.year}.')
Expand Down
12 changes: 6 additions & 6 deletions tests/test_scrapers.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ def test_check_if_all_date_was_scraped_csv() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
thread_scrape.thread_scrape()
thread_scrape.thread_scrape(timezone=city_timezone)
check_csv_if_all_date_was_scraped()

with sqlite3.connect(sqlite_name) as conn:
Expand Down Expand Up @@ -102,7 +102,7 @@ def test_thread_scraper() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
df = thread_scrape.thread_scrape()
df = thread_scrape.thread_scrape(timezone=city_timezone)

assert not df.empty

Expand Down Expand Up @@ -133,7 +133,7 @@ def test_thread_scraper_past_month() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
df = thread_scrape.thread_scrape()
df = thread_scrape.thread_scrape(timezone=city_timezone)

assert df is None

Expand Down Expand Up @@ -165,7 +165,7 @@ def test_until_month_end_scraper() -> None:
)

month_end = MonthEndBasicScraper(hotel_stay)
df = month_end.scrape_until_month_end()
df = month_end.scrape_until_month_end(timezone=city_timezone)

assert not df.empty

Expand Down Expand Up @@ -197,7 +197,7 @@ def test_until_month_end_scraper_past_month() -> None:
)

month_end = MonthEndBasicScraper(hotel_stay)
df = month_end.scrape_until_month_end()
df = month_end.scrape_until_month_end(timezone=city_timezone)

assert df is None

Expand Down Expand Up @@ -267,7 +267,7 @@ def test_check_if_all_date_was_scraped() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
thread_scrape.thread_scrape(to_sqlite=True)
thread_scrape.thread_scrape(to_sqlite=True, timezone=city_timezone)
check_db_if_all_date_was_scraped(sqlite_name)

with sqlite3.connect(sqlite_name) as conn:
Expand Down

0 comments on commit 825e526

Please sign in to comment.