From 0c580695d5f34f58f4fe1994fb8ccd43d96ccfd0 Mon Sep 17 00:00:00 2001
From: Vlada Dusek
Date: Fri, 25 Oct 2024 15:48:38 +0200
Subject: [PATCH] chore: minor update Scrapy template and wrapper (#299)

---
 .../python-beautifulsoup/requirements.txt     |  2 +-
 .../requirements.txt                          |  2 +-
 .../requirements.txt                          |  2 +-
 templates/python-empty/requirements.txt       |  2 +-
 templates/python-playwright/requirements.txt  |  2 +-
 templates/python-scrapy/requirements.txt      |  2 +-
 templates/python-scrapy/src/__main__.py       | 22 ++++----
 templates/python-scrapy/src/main.py           | 14 ++---
 templates/python-selenium/requirements.txt    |  2 +-
 templates/python-standby/requirements.txt     |  2 +-
 templates/python-start/requirements.txt       |  2 +-
 .../{projectFolder}/__main__.template.py      | 28 ++++++----
 .../{projectFolder}/main.template.py          | 56 +++----------------
 13 files changed, 52 insertions(+), 86 deletions(-)

diff --git a/templates/python-beautifulsoup/requirements.txt b/templates/python-beautifulsoup/requirements.txt
index d37f42b7..0aacbc06 100644
--- a/templates/python-beautifulsoup/requirements.txt
+++ b/templates/python-beautifulsoup/requirements.txt
@@ -1,7 +1,7 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 beautifulsoup4[lxml]
 httpx
 types-beautifulsoup4
diff --git a/templates/python-crawlee-beautifulsoup/requirements.txt b/templates/python-crawlee-beautifulsoup/requirements.txt
index 54de46a4..8c2f2652 100644
--- a/templates/python-crawlee-beautifulsoup/requirements.txt
+++ b/templates/python-crawlee-beautifulsoup/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 crawlee[beautifulsoup]
diff --git a/templates/python-crawlee-playwright/requirements.txt b/templates/python-crawlee-playwright/requirements.txt
index bf947966..3e1108e6 100644
--- a/templates/python-crawlee-playwright/requirements.txt
+++ b/templates/python-crawlee-playwright/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 crawlee[playwright]
diff --git a/templates/python-empty/requirements.txt b/templates/python-empty/requirements.txt
index 345704d4..6d3daa18 100644
--- a/templates/python-empty/requirements.txt
+++ b/templates/python-empty/requirements.txt
@@ -1,4 +1,4 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
diff --git a/templates/python-playwright/requirements.txt b/templates/python-playwright/requirements.txt
index d3853438..e59cbcf7 100644
--- a/templates/python-playwright/requirements.txt
+++ b/templates/python-playwright/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 playwright
diff --git a/templates/python-scrapy/requirements.txt b/templates/python-scrapy/requirements.txt
index f40379fa..db0f75d0 100644
--- a/templates/python-scrapy/requirements.txt
+++ b/templates/python-scrapy/requirements.txt
@@ -1,6 +1,6 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify[scrapy] == 2.0.0
+apify[scrapy] ~= 2.0.0
 nest-asyncio
 scrapy
diff --git a/templates/python-scrapy/src/__main__.py b/templates/python-scrapy/src/__main__.py
index a86d7956..26be8ddf 100644
--- a/templates/python-scrapy/src/__main__.py
+++ b/templates/python-scrapy/src/__main__.py
@@ -36,8 +36,7 @@
 
 
 def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
-    """
-    Configure a logger with the specified settings.
+    """Configure a logger with the specified settings.
 
     Args:
         logger_name: The name of the logger to be configured.
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 
 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """
+    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+
     We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
     logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
     loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,22 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 
 scrapy_logging.configure_logging = new_configure_logging
 
-# Now we can do the rest of the setup
+# Now we can do the rest of the setup.
 import asyncio
 import os
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
 from .main import main
 
-# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
-# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
-# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
-# otherwise, it will not work correctly on Windows.
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
+# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together.
+#
+# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
+# on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()
 
-# Specify the path to the Scrapy project settings module
+# Specify the path to the Scrapy project settings module.
 os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'
 
-# Run the Apify main coroutine
+# Run the Apify main coroutine in the event loop.
 asyncio.run(main())
diff --git a/templates/python-scrapy/src/main.py b/templates/python-scrapy/src/main.py
index 34442676..30aa40c0 100644
--- a/templates/python-scrapy/src/main.py
+++ b/templates/python-scrapy/src/main.py
@@ -26,21 +26,19 @@
 from apify import Actor
 from apify.scrapy.utils import apply_apify_settings
 
-# Import your Scrapy spider here
+# Import your Scrapy spider here.
 from .spiders.title import TitleSpider as Spider
 
-# Default input values for local execution using `apify run`
+# Default input values for local execution using `apify run`.
 LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
 
 
 async def main() -> None:
-    """
-    Apify Actor main coroutine for executing the Scrapy spider.
-    """
+    """Apify Actor main coroutine for executing the Scrapy spider."""
     async with Actor:
         Actor.log.info('Actor is being executed...')
 
-        # Process Actor input
+        # Retrieve and process Actor input.
         actor_input = await Actor.get_input() or {}
         start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
         proxy_config = actor_input.get('proxyConfiguration')
@@ -53,10 +51,10 @@ async def main() -> None:
             url = start_url.get('url')
             await request_queue.add_request(url)
 
-        # Apply Apify settings, it will override the Scrapy project settings
+        # Apply Apify settings, it will override the Scrapy project settings.
         settings = apply_apify_settings(proxy_config=proxy_config)
 
-        # Execute the spider using Scrapy CrawlerProcess
+        # Execute the spider using Scrapy `CrawlerProcess`.
         process = CrawlerProcess(settings, install_root_handler=False)
         process.crawl(Spider)
         process.start()
diff --git a/templates/python-selenium/requirements.txt b/templates/python-selenium/requirements.txt
index e35b8635..e66d21ea 100644
--- a/templates/python-selenium/requirements.txt
+++ b/templates/python-selenium/requirements.txt
@@ -1,5 +1,5 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 selenium
diff --git a/templates/python-standby/requirements.txt b/templates/python-standby/requirements.txt
index 345704d4..6d3daa18 100644
--- a/templates/python-standby/requirements.txt
+++ b/templates/python-standby/requirements.txt
@@ -1,4 +1,4 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
diff --git a/templates/python-start/requirements.txt b/templates/python-start/requirements.txt
index d37f42b7..0aacbc06 100644
--- a/templates/python-start/requirements.txt
+++ b/templates/python-start/requirements.txt
@@ -1,7 +1,7 @@
 # Feel free to add your Python dependencies below. For formatting guidelines, see:
 # https://pip.pypa.io/en/latest/reference/requirements-file-format/
 
-apify == 2.0.0
+apify ~= 2.0.0
 beautifulsoup4[lxml]
 httpx
 types-beautifulsoup4
diff --git a/wrappers/python-scrapy/{projectFolder}/__main__.template.py b/wrappers/python-scrapy/{projectFolder}/__main__.template.py
index 715c439a..2effe97e 100644
--- a/wrappers/python-scrapy/{projectFolder}/__main__.template.py
+++ b/wrappers/python-scrapy/{projectFolder}/__main__.template.py
@@ -20,9 +20,9 @@
 from apify.log import ActorLogFormatter
 
 # Define names of the loggers.
-APIFY_LOGGER_NAMES = ['apify', 'apify_client']
-SCRAPY_LOGGER_NAMES = ['filelock', 'hpack', 'httpx', 'scrapy', 'twisted']
-ALL_LOGGER_NAMES = APIFY_LOGGER_NAMES + SCRAPY_LOGGER_NAMES
+MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
+OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
+ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES
 
 # To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
 # Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
@@ -36,8 +36,7 @@
 
 
 def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
-    """
-    Configure a logger with the specified settings.
+    """Configure a logger with the specified settings.
 
     Args:
         logger_name: The name of the logger to be configured.
@@ -54,7 +53,7 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 # Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
 # the `main.py` and Scrapy components.
-for logger_name in APIFY_LOGGER_NAMES:
+for logger_name in MAIN_LOGGER_NAMES:
     configure_logger(logger_name, LOGGING_LEVEL, apify_handler)
 
 # We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH
 
 
 def new_configure_logging(*args: Any, **kwargs: Any) -> None:
-    """
+    """Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
+
     We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
     logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
     loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,20 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:
 
 scrapy_logging.configure_logging = new_configure_logging
 
-# Now we can do the rest of the setup
+# Now we can do the rest of the setup.
 import asyncio
 import os
 import nest_asyncio
 from scrapy.utils.reactor import install_reactor
 from .main import main
 
-# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
-# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
+# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
+# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
+# to work together.
+#
+# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
+# on Windows.
 install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
 nest_asyncio.apply()
 
-# Specify the path to the Scrapy project settings module
+# Specify the path to the Scrapy project settings module.
 os.environ['SCRAPY_SETTINGS_MODULE'] = '{{scrapy_settings_module}}'
 
-# Run the Apify main coroutine
+# Run the Apify main coroutine in the event loop.
 asyncio.run(main())
diff --git a/wrappers/python-scrapy/{projectFolder}/main.template.py b/wrappers/python-scrapy/{projectFolder}/main.template.py
index 73248ef8..558783f9 100644
--- a/wrappers/python-scrapy/{projectFolder}/main.template.py
+++ b/wrappers/python-scrapy/{projectFolder}/main.template.py
@@ -22,64 +22,26 @@
 from __future__ import annotations
 
 from scrapy.crawler import CrawlerProcess
-from scrapy.settings import Settings
-from scrapy.utils.project import get_project_settings
 
 from apify import Actor
+from apify.scrapy.utils import apply_apify_settings
 
-# Import your Scrapy spider here
+# Import your Scrapy spider here.
 from {{spider_module_name}} import {{spider_class_name}} as Spider
 
-# Default input values for local execution using `apify run`
+# Default input values for local execution using `apify run`.
 LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]
 
 
-def _get_scrapy_settings(proxy_cfg: dict | None = None) -> Settings:
-    """
-    Get Scrapy project settings with custom configurations.
-
-    You can add your own Scrapy components either in this function or in your `settings.py`.
-
-    Returns:
-        Scrapy project settings with custom configurations.
-    """
-    settings = get_project_settings()
-
-    # Use ApifyScheduler as the scheduler
-    settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'
-
-    # Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
-    # ensuring it is executed as the final step in the pipeline sequence
-    settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000
-
-    # Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None
-
-    # Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
-
-    # Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
-    settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
-    settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000
-
-    # Store the proxy configuration
-    settings['APIFY_PROXY_SETTINGS'] = proxy_cfg
-
-    return settings
-
-
 async def main() -> None:
-    """
-    Apify Actor main coroutine for executing the Scrapy spider.
-    """
+    """Apify Actor main coroutine for executing the Scrapy spider."""
     async with Actor:
         Actor.log.info('Actor is being executed...')
 
-        # Process Actor input
+        # Retrieve and process Actor input.
         actor_input = await Actor.get_input() or {}
         start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
-        proxy_configuration = actor_input.get('proxyConfiguration')
+        proxy_config = actor_input.get('proxyConfiguration')
 
         # Open the default request queue for handling URLs to be processed.
         request_queue = await Actor.open_request_queue()
@@ -89,10 +51,10 @@ async def main() -> None:
             url = start_url.get('url')
             await request_queue.add_request(url)
 
-        # Get Scrapy project settings with custom configurations
-        settings = _get_scrapy_settings(proxy_configuration)
+        # Apply Apify settings, it will override the Scrapy project settings.
+        settings = apply_apify_settings(proxy_config=proxy_config)
 
-        # Execute the spider using Scrapy CrawlerProcess
+        # Execute the spider using Scrapy `CrawlerProcess`.
         process = CrawlerProcess(settings, install_root_handler=False)
         process.crawl(Spider)
         process.start()
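
Notes on the changes above:

The switch from `apify == 2.0.0` to `apify ~= 2.0.0` in the requirements files uses PEP 440's compatible-release
operator: `~= 2.0.0` is equivalent to `>= 2.0.0, == 2.0.*`, so the templates pick up patch releases of the SDK
automatically while still excluding 2.1 and later.

The wrapper's hand-rolled `_get_scrapy_settings` helper is replaced by `apply_apify_settings` from
`apify.scrapy.utils`, the same call the template's `main.py` already makes. Judging from the deleted helper, the call
should be roughly equivalent to the sketch below. This is an illustration inferred from that helper, not the actual
SDK implementation, and the function name `apply_apify_settings_sketch` is hypothetical.

    from scrapy.settings import Settings
    from scrapy.utils.project import get_project_settings


    def apply_apify_settings_sketch(*, proxy_config: dict | None = None) -> Settings:
        # Start from the Scrapy project settings (the module named in SCRAPY_SETTINGS_MODULE).
        settings = get_project_settings()

        # Schedule requests through the Apify request queue instead of Scrapy's default scheduler.
        settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

        # Push scraped items to the Apify dataset as the final pipeline step (priority 1000).
        settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

        # Disable RobotsTxtMiddleware; the Apify scheduler already handles robots.txt.
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

        # Swap Scrapy's proxy and retry middlewares for the Apify ones.
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
        settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950
        settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
        settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

        # Store the proxy configuration from the Actor input for the Apify proxy middleware.
        settings['APIFY_PROXY_SETTINGS'] = proxy_config
        return settings

Passing the resulting settings to `CrawlerProcess(settings, install_root_handler=False)` is what routes scheduling
through the Apify request queue and item storage to the default dataset without further edits to the project's
`settings.py`.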