From 0c580695d5f34f58f4fe1994fb8ccd43d96ccfd0 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 25 Oct 2024 15:48:38 +0200
Subject: [PATCH] chore: minor update Scrapy template and wrapper (#299)

---
 .../python-beautifulsoup/requirements.txt     |  2 +-
 .../requirements.txt                          |  2 +-
 .../requirements.txt                          |  2 +-
 templates/python-empty/requirements.txt       |  2 +-
 templates/python-playwright/requirements.txt  |  2 +-
 templates/python-scrapy/requirements.txt      |  2 +-
 templates/python-scrapy/src/__main__.py       | 22 ++++----
 templates/python-scrapy/src/main.py           | 14 ++---
 templates/python-selenium/requirements.txt    |  2 +-
 templates/python-standby/requirements.txt     |  2 +-
 templates/python-start/requirements.txt       |  2 +-
 .../{projectFolder}/__main__.template.py      | 28 ++++++----
 .../{projectFolder}/main.template.py          | 56 +++----------------
 13 files changed, 52 insertions(+), 86 deletions(-)

diff --git a/templates/python-beautifulsoup/requirements.txt b/templates/python-beautifulsoup/requirements.txt
index d37f42b7..0aacbc06 100644
--- a/templates/python-beautifulsoup/requirements.txt
+++ b/templates/python-beautifulsoup/requirements.txt
@@ -1,7 +1,7 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 beautifulsoup4[lxml]
 httpx
 types-beautifulsoup4
diff --git a/templates/python-crawlee-beautifulsoup/requirements.txt b/templates/python-crawlee-beautifulsoup/requirements.txt
index 54de46a4..8c2f2652 100644
--- a/templates/python-crawlee-beautifulsoup/requirements.txt
+++ b/templates/python-crawlee-beautifulsoup/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 crawlee[beautifulsoup]
diff --git a/templates/python-crawlee-playwright/requirements.txt b/templates/python-crawlee-playwright/requirements.txt
index bf947966..3e1108e6 100644
--- a/templates/python-crawlee-playwright/requirements.txt
+++ b/templates/python-crawlee-playwright/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 crawlee[playwright]
diff --git a/templates/python-empty/requirements.txt b/templates/python-empty/requirements.txt
index 345704d4..6d3daa18 100644
--- a/templates/python-empty/requirements.txt
+++ b/templates/python-empty/requirements.txt
@@ -1,4 +1,4 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
diff --git a/templates/python-playwright/requirements.txt b/templates/python-playwright/requirements.txt
index d3853438..e59cbcf7 100644
--- a/templates/python-playwright/requirements.txt
+++ b/templates/python-playwright/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 playwright
diff --git a/templates/python-scrapy/requirements.txt b/templates/python-scrapy/requirements.txt
index f40379fa..db0f75d0 100644
--- a/templates/python-scrapy/requirements.txt
+++ b/templates/python-scrapy/requirements.txt
@@ -1,6 +1,6 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify[scrapy] == 2.0.0
+apify[scrapy] ~= 2.0.0
 nest-asyncio
 scrapy
diff --git a/templates/python-scrapy/src/__main__.py b/templates/python-scrapy/src/__main__.py
index a86d7956..26be8ddf 100644
--- a/templates/python-scrapy/src/__main__.py
+++ b/templates/python-scrapy/src/__main__.py
@@ -36,8 +36,7 @@
 
 
 def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
-    """
-    Configure a logger with the specified settings.
+    """Configure a logger with the specified settings.
 
     Args:
         logger_name: The name of the logger to be configured.
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 
 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """
+    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+
     We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
     logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
     loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,22 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 
 scrapy_logging.configure_logging = new_configure_logging
 
-# Now we can do the rest of the setup
+# Now we can do the rest of the setup.
 import asyncio
 import os
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
 from .main import main
 
-# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
-# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
-# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
-# otherwise, it will not work correctly on Windows.
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
+# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together.
+#
+# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
+# on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()
 
-# Specify the path to the Scrapy project settings module
+# Specify the path to the Scrapy project settings module.
 os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
 
-# Run the Apify main coroutine
+# Run the Apify main coroutine in the event loop.
 asyncio.run(main())
diff --git a/templates/python-scrapy/src/main.py b/templates/python-scrapy/src/main.py
index 34442676..30aa40c0 100644
--- a/templates/python-scrapy/src/main.py
+++ b/templates/python-scrapy/src/main.py
@@ -26,21 +26,19 @@
 from apify import Actor
 from apify.scrapy.utils import apply_apify_settings
 
-# Import your Scrapy spider here
+# Import your Scrapy spider here.
 from .spiders.title import TitleSpider as Spider
 
-# Default input values for local execution using `apify run`
+# Default input values for local execution using `apify run`.
 LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
 
 
 async def main() -> None:
-    """
-    Apify Actor main coroutine for executing the Scrapy spider.
-    """
+    """Apify Actor main coroutine for executing the Scrapy spider."""
     async with Actor:
         Actor.log.info('Actor is being executed...')
 
-        # Process Actor input
+        # Retrieve and process Actor input.
         actor_input = await Actor.get_input() or {}
         start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
         proxy_config = actor_input.get('proxyConfiguration')
@@ -53,10 +51,10 @@ async def main() -> None:
             url = start_url.get('url')
             await request_queue.add_request(url)
 
-        # Apply Apify settings, it will override the Scrapy project settings
+        # Apply Apify settings, it will override the Scrapy project settings.
         settings = apply_apify_settings(proxy_config=proxy_config)
 
-        # Execute the spider using Scrapy CrawlerProcess
+        # Execute the spider using Scrapy `CrawlerProcess`.
         process = CrawlerProcess(settings, install_root_handler=False)
         process.crawl(Spider)
         process.start()
diff --git a/templates/python-selenium/requirements.txt b/templates/python-selenium/requirements.txt
index e35b8635..e66d21ea 100644
--- a/templates/python-selenium/requirements.txt
+++ b/templates/python-selenium/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 selenium
diff --git a/templates/python-standby/requirements.txt b/templates/python-standby/requirements.txt
index 345704d4..6d3daa18 100644
--- a/templates/python-standby/requirements.txt
+++ b/templates/python-standby/requirements.txt
@@ -1,4 +1,4 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
diff --git a/templates/python-start/requirements.txt b/templates/python-start/requirements.txt
index d37f42b7..0aacbc06 100644
--- a/templates/python-start/requirements.txt
+++ b/templates/python-start/requirements.txt
@@ -1,7 +1,7 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 beautifulsoup4[lxml]
 httpx
 types-beautifulsoup4
diff --git a/wrappers/python-scrapy/{projectFolder}/__main__.template.py b/wrappers/python-scrapy/{projectFolder}/__main__.template.py
index 715c439a..2effe97e 100644
--- a/wrappers/python-scrapy/{projectFolder}/__main__.template.py
+++ b/wrappers/python-scrapy/{projectFolder}/__main__.template.py
@@ -20,9 +20,9 @@
 from apify.log import ActorLogFormatter
 
 # Define names of the loggers.
-APIFY_LOGGER_NAMES = ['apify', 'apify_client']
-SCRAPY_LOGGER_NAMES = ['filelock', 'hpack', 'httpx', 'scrapy', 'twisted']
-ALL_LOGGER_NAMES = APIFY_LOGGER_NAMES + SCRAPY_LOGGER_NAMES
+MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
+OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
 
 # To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
 # Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
@@ -36,8 +36,7 @@
 
 
 def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
-    """
-    Configure a logger with the specified settings.
+    """Configure a logger with the specified settings.
 
     Args:
         logger_name: The name of the logger to be configured.
@@ -54,7 +53,7 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 # Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
 # the `main.py` and Scrapy components.
-for logger_name in APIFY_LOGGER_NAMES:
+for logger_name in MAIN_LOGGER_NAMES:
     configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
 
 # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 
 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """
+    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+
     We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
     logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
     loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,20 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 
 scrapy_logging.configure_logging = new_configure_logging
 
-# Now we can do the rest of the setup
+# Now we can do the rest of the setup.
 import asyncio
 import os
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
 from .main import main
 
-# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
-# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
+# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together.
+#
+# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
+# on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()
 
-# Specify the path to the Scrapy project settings module
+# Specify the path to the Scrapy project settings module.
 os.environ['SCRAPY_SETTINGS_MODULE'] = '{{scrapy_settings_module}}'
 
-# Run the Apify main coroutine
+# Run the Apify main coroutine in the event loop.
 asyncio.run(main())
diff --git a/wrappers/python-scrapy/{projectFolder}/main.template.py b/wrappers/python-scrapy/{projectFolder}/main.template.py
index 73248ef8..558783f9 100644
--- a/wrappers/python-scrapy/{projectFolder}/main.template.py
+++ b/wrappers/python-scrapy/{projectFolder}/main.template.py
@@ -22,64 +22,26 @@
 from __future__ import annotations
 
 from scrapy.crawler import CrawlerProcess
-from scrapy.settings import Settings
-from scrapy.utils.project import get_project_settings
 
 from apify import Actor
+from apify.scrapy.utils import apply_apify_settings
 
-# Import your Scrapy spider here
+# Import your Scrapy spider here.
 from {{spider_module_name}} import {{spider_class_name}} as Spider
 
-# Default input values for local execution using `apify run`
+# Default input values for local execution using `apify run`.
 LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
 
 
-def _get_scrapy_settings(proxy_cfg: dict | None = None) -> Settings:
-    """
-    Get Scrapy project settings with custom configurations.
-
-    You can add your own Scrapy components either in this function or in your `settings.py`.
-
-    Returns:
-        Scrapy project settings with custom configurations.
-    """
-    settings = get_project_settings()
-
-    # Use ApifyScheduler as the scheduler
-    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
-
-    # Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
-    # ensuring it is executed as the final step in the pipeline sequence
-    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
-
-    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
-
-    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
-
-    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
-
-    # Store the proxy configuration
-    settings['APIFY_PROXY_SETTINGS'] = proxy_cfg
-
-    return settings
-
-
 async def main() -> None:
-    """
-    Apify Actor main coroutine for executing the Scrapy spider.
-    """
+    """Apify Actor main coroutine for executing the Scrapy spider."""
     async with Actor:
         Actor.log.info('Actor is being executed...')
 
-        # Process Actor input
+        # Retrieve and process Actor input.
         actor_input = await Actor.get_input() or {}
         start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
-        proxy_configuration = actor_input.get('proxyConfiguration')
+        proxy_config = actor_input.get('proxyConfiguration')
 
         # Open the default request queue for handling URLs to be processed.
         request_queue = await Actor.open_request_queue()
@@ -89,10 +51,10 @@ async def main() -> None:
             url = start_url.get('url')
             await request_queue.add_request(url)
 
-        # Get Scrapy project settings with custom configurations
-        settings = _get_scrapy_settings(proxy_configuration)
+        # Apply Apify settings, it will override the Scrapy project settings.
+        settings = apply_apify_settings(proxy_config=proxy_config)
 
-        # Execute the spider using Scrapy CrawlerProcess
+        # Execute the spider using Scrapy `CrawlerProcess`.
         process = CrawlerProcess(settings, install_root_handler=False)
         process.crawl(Spider)
         process.start()
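
Notes on the changes above:

The switch from `apify == 2.0.0` to `apify ~= 2.0.0` in the requirements files uses PEP 440's compatible-release
operator: `~= 2.0.0` is equivalent to `>= 2.0.0, == 2.0.*`, so the templates pick up patch releases of the SDK
automatically while still excluding 2.1 and later.

The wrapper's hand-rolled `_get_scrapy_settings` helper is replaced by `apply_apify_settings` from
`apify.scrapy.utils`, the same call the template's `main.py` already makes. Judging from the deleted helper, the call
should be roughly equivalent to the sketch below. This is an illustration inferred from that helper, not the actual
SDK implementation, and the function name `apply_apify_settings_sketch` is hypothetical.

    from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings


    def apply_apify_settings_sketch(*, proxy_config: dict | None = None) -> Settings:
        # Start from the Scrapy project settings (the module named in SCRAPY_SETTINGS_MODULE).
        settings = get_project_settings()

        # Schedule requests through the Apify request queue instead of Scrapy's default scheduler.
        settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

        # Push scraped items to the Apify dataset as the final pipeline step (priority 1000).
        settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

        # Disable RobotsTxtMiddleware; the Apify scheduler already handles robots.txt.
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

        # Swap Scrapy's proxy and retry middlewares for the Apify ones.
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
        settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
        settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

        # Store the proxy configuration from the Actor input for the Apify proxy middleware.
        settings['APIFY_PROXY_SETTINGS'] = proxy_config
        return settings

Passing the resulting settings to `CrawlerProcess(settings, install_root_handler=False)` is what routes scheduling
through the Apify request queue and item storage to the default dataset without further edits to the project's
`settings.py`.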