|
1 | | -import os |
2 | | -import sys |
3 | | -import logging |
4 | | -import asyncio |
5 | | -from logging.handlers import RotatingFileHandler |
6 | | -from excel_scraper import NYCInfoHubScraper |
7 | | - |
8 | | -# Ensure stdout is line-buffered so logs appear in real time |
9 | | -sys.stdout.reconfigure(line_buffering=True) |
10 | | - |
11 | | -# -------------------- SCRAPER EXECUTION -------------------- |
12 | | -async def main(): |
13 | | - """ |
14 | | - Main entry point for running the NYCInfoHubScraper. |
15 | | - Delegates the entire scraping workflow to the scraper's scrape_data(). |
16 | | - """ |
17 | | - scraper = NYCInfoHubScraper() |
18 | | - try: |
19 | | - # The new refactored pipeline is entirely within scrape_data() |
20 | | - await scraper.scrape_data() |
21 | | - except Exception as e: |
22 | | - logging.error(f"Some error occurred: {e}", exc_info=True) |
23 | | - return 1 # Non-zero exit code indicates an error |
24 | | - finally: |
25 | | - # Clean up Selenium & httpx |
26 | | - await scraper.close() |
27 | | - |
28 | | - return 0 # Signals success to the caller |
29 | | - |
30 | | -# Run scraper process if script is executed directly |
31 | | -if __name__ == "__main__": |
32 | | - base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) |
33 | | - logs_dir = os.path.join(base_dir, "logs") |
34 | | - os.makedirs(logs_dir, exist_ok=True) |
35 | | - |
36 | | - # Create rotating log handler |
37 | | - log_file_path = os.path.join(logs_dir, "excel_fetch.log") |
38 | | - rotating_handler = RotatingFileHandler( |
39 | | - log_file_path, |
40 | | - maxBytes=5_242_880, # ~5 MB |
41 | | - backupCount=2, |
42 | | - encoding="utf-8" |
43 | | - ) |
44 | | - rotating_handler.setFormatter(logging.Formatter( |
45 | | - "%(asctime)s - %(levelname)s - %(message)s" |
46 | | - )) |
47 | | - |
48 | | - # Configure logging with both file & console handlers |
49 | | - logging.basicConfig( |
50 | | - level=logging.INFO, |
51 | | - format="%(asctime)s - %(levelname)s - %(message)s", |
52 | | - handlers=[rotating_handler, logging.StreamHandler()], |
53 | | - force=True |
54 | | - ) |
55 | | - |
56 | | - try: |
57 | | - exit_code = asyncio.run(main()) |
58 | | - sys.exit(exit_code) |
59 | | - except Exception as e: |
60 | | - logging.error(f"Script failed: {e}", exc_info=True) |
61 | | - sys.exit(1) |
| 1 | +import os |
| 2 | +import sys |
| 3 | +import logging |
| 4 | +import asyncio |
| 5 | +from logging.handlers import RotatingFileHandler |
| 6 | +from src.excel_scraper.scraper import NYCInfoHubScraper |
| 7 | + |
# Ensure stdout is line-buffered so logs appear in real time
# (matters when output is piped, e.g. under cron or a process supervisor,
# where stdout is block-buffered by default).
# NOTE(review): reconfigure() requires Python 3.7+ and assumes sys.stdout
# is a real io.TextIOWrapper — raises AttributeError if stdout has been
# replaced (some test harnesses do this); confirm the deployment runtime.
sys.stdout.reconfigure(line_buffering=True)
| 10 | + |
| 11 | +# -------------------- SCRAPER EXECUTION -------------------- |
async def main():
    """
    Main entry point for running the NYCInfoHubScraper.

    Delegates the entire scraping workflow to the scraper's scrape_data()
    and guarantees resource cleanup (Selenium driver, httpx client) via
    the ``finally`` block, even when the scrape raises.

    Returns:
        int: 0 on success, 1 if scrape_data() raised an exception.
    """
    scraper = NYCInfoHubScraper()
    try:
        # The refactored pipeline is entirely within scrape_data()
        await scraper.scrape_data()
    except Exception as e:
        # Lazy %-style args: the message is only formatted if the record
        # is actually emitted; exc_info=True attaches the full traceback.
        logging.error("Some error occurred: %s", e, exc_info=True)
        return 1  # Non-zero exit code indicates an error
    finally:
        # Clean up Selenium & httpx regardless of success or failure
        await scraper.close()

    return 0  # Signals success to the caller
| 29 | + |
| 30 | +# Run scraper process if script is executed directly |
if __name__ == "__main__":
    # Project root is one level above this script; logs live under <root>/logs
    base_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
    logs_dir = os.path.join(base_dir, "logs")
    os.makedirs(logs_dir, exist_ok=True)

    # Rotating log handler: caps each file at ~5 MB, keeps 2 backups
    log_file_path = os.path.join(logs_dir, "excel_fetch.log")
    rotating_handler = RotatingFileHandler(
        log_file_path,
        maxBytes=5_242_880,  # ~5 MB
        backupCount=2,
        encoding="utf-8",
    )
    rotating_handler.setFormatter(logging.Formatter(
        "%(asctime)s - %(levelname)s - %(message)s"
    ))

    # Configure logging with both file & console handlers. force=True
    # removes any handlers already installed (e.g. by imported modules)
    # so this configuration always wins.
    logging.basicConfig(
        level=logging.INFO,
        format="%(asctime)s - %(levelname)s - %(message)s",
        handlers=[rotating_handler, logging.StreamHandler()],
        force=True,
    )

    try:
        exit_code = asyncio.run(main())
        sys.exit(exit_code)
    except KeyboardInterrupt:
        # asyncio.run() re-raises Ctrl+C as KeyboardInterrupt, which the
        # broad Exception handler below would NOT catch — without this
        # clause the user sees a raw traceback. 130 = 128 + SIGINT.
        logging.warning("Interrupted by user; shutting down.")
        sys.exit(130)
    except Exception as e:
        # Last-resort boundary handler: log with traceback, fail the process
        logging.error("Script failed: %s", e, exc_info=True)
        sys.exit(1)
0 commit comments