
Commit

chore: minor update Scrapy template and wrapper (#299)
vdusek authored Oct 25, 2024
1 parent a4ad19a commit 0c58069
Showing 13 changed files with 52 additions and 86 deletions.
2 changes: 1 addition & 1 deletion templates/python-beautifulsoup/requirements.txt
@@ -1,7 +1,7 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
beautifulsoup4[lxml]
httpx
types-beautifulsoup4
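
The only change in this and the following requirements files is the switch of the apify pin from an exact `==` pin to the compatible-release operator `~=`, which lets patch releases of the 2.0 line be picked up automatically. A minimal sketch of the difference (not part of the commit), assuming the `packaging` library is installed; the candidate versions are illustrative:

# Illustrative sketch only — compares an exact pin with a compatible-release specifier.
from packaging.specifiers import SpecifierSet

exact = SpecifierSet('==2.0.0')       # only 2.0.0 satisfies the pin
compatible = SpecifierSet('~=2.0.0')  # equivalent to >=2.0.0, ==2.0.*

for candidate in ('2.0.0', '2.0.3', '2.1.0'):
    print(candidate, candidate in exact, candidate in compatible)
# 2.0.0 True True
# 2.0.3 False True
# 2.1.0 False False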
2 changes: 1 addition & 1 deletion templates/python-crawlee-beautifulsoup/requirements.txt
@@ -1,5 +1,5 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
crawlee[beautifulsoup]
2 changes: 1 addition & 1 deletion templates/python-crawlee-playwright/requirements.txt
@@ -1,5 +1,5 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
crawlee[playwright]
2 changes: 1 addition & 1 deletion templates/python-empty/requirements.txt
@@ -1,4 +1,4 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
2 changes: 1 addition & 1 deletion templates/python-playwright/requirements.txt
@@ -1,5 +1,5 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
playwright
2 changes: 1 addition & 1 deletion templates/python-scrapy/requirements.txt
@@ -1,6 +1,6 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify[scrapy] == 2.0.0
apify[scrapy] ~= 2.0.0
nest-asyncio
scrapy
22 changes: 12 additions & 10 deletions templates/python-scrapy/src/__main__.py
@@ -36,8 +36,7 @@


def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
"""
Configure a logger with the specified settings.
"""Configure a logger with the specified settings.
Args:
logger_name: The name of the logger to be configured.
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
"""Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,22 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:

scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
# Now we can do the rest of the setup.
import asyncio
import os
import nest_asyncio
from scrapy.utils.reactor import install_reactor
from .main import main

# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
# The reactor installation must be done manually before calling `nest_asyncio.apply()`,
# otherwise, it will not work correctly on Windows.
# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
# to work together.
#
# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
# on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Specify the path to the Scrapy project settings module
# Specify the path to the Scrapy project settings module.
os.environ['SCRAPY_SETTINGS_MODULE'] = 'src.settings'

# Run the Apify main coroutine
# Run the Apify main coroutine in the event loop.
asyncio.run(main())
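
For context, the body of `new_configure_logging` is collapsed in this diff view. Below is a self-contained sketch (not part of the commit) of the monkeypatch pattern the surrounding comments describe: Scrapy's `configure_logging` is replaced with a wrapper that re-attaches our handler after Scrapy's own setup, so Scrapy cannot strip it. The handler and level here are illustrative stand-ins for the template's Apify handler and its `LOG_LEVEL` setting, and whether the real implementation calls the original function in exactly this way is an assumption:

# Illustrative sketch only — not the template's actual implementation.
from logging import StreamHandler, getLogger
from typing import Any

from scrapy.utils import log as scrapy_logging

illustrative_handler = StreamHandler()  # stand-in for the template's Apify log handler
_original_configure_logging = scrapy_logging.configure_logging


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
    # Let Scrapy perform its own logging setup first...
    _original_configure_logging(*args, **kwargs)
    # ...then re-attach our handler to the root and Scrapy-associated loggers,
    # so Scrapy's configuration cannot override it.
    for name in (None, 'scrapy', 'twisted'):
        logger = getLogger(name)
        logger.setLevel('INFO')  # stand-in for the LOG_LEVEL field from settings.py
        logger.addHandler(illustrative_handler)


scrapy_logging.configure_logging = new_configure_logging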
14 changes: 6 additions & 8 deletions templates/python-scrapy/src/main.py
@@ -26,21 +26,19 @@
from apify import Actor
from apify.scrapy.utils import apply_apify_settings

# Import your Scrapy spider here
# Import your Scrapy spider here.
from .spiders.title import TitleSpider as Spider

# Default input values for local execution using `apify run`
# Default input values for local execution using `apify run`.
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


async def main() -> None:
"""
Apify Actor main coroutine for executing the Scrapy spider.
"""
"""Apify Actor main coroutine for executing the Scrapy spider."""
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input
# Retrieve and process Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
proxy_config = actor_input.get('proxyConfiguration')
@@ -53,10 +51,10 @@ async def main() -> None:
url = start_url.get('url')
await request_queue.add_request(url)

# Apply Apify settings, it will override the Scrapy project settings
# Apply Apify settings, it will override the Scrapy project settings.
settings = apply_apify_settings(proxy_config=proxy_config)

# Execute the spider using Scrapy CrawlerProcess
# Execute the spider using Scrapy `CrawlerProcess`.
process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()
2 changes: 1 addition & 1 deletion templates/python-selenium/requirements.txt
@@ -1,5 +1,5 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
selenium
2 changes: 1 addition & 1 deletion templates/python-standby/requirements.txt
@@ -1,4 +1,4 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
2 changes: 1 addition & 1 deletion templates/python-start/requirements.txt
@@ -1,7 +1,7 @@
# Feel free to add your Python dependencies below. For formatting guidelines, see:
# https://pip.pypa.io/en/latest/reference/requirements-file-format/

apify == 2.0.0
apify ~= 2.0.0
beautifulsoup4[lxml]
httpx
types-beautifulsoup4
28 changes: 16 additions & 12 deletions wrappers/python-scrapy/{projectFolder}/__main__.template.py
@@ -20,9 +20,9 @@
from apify.log import ActorLogFormatter

# Define names of the loggers.
APIFY_LOGGER_NAMES = ['apify', 'apify_client']
SCRAPY_LOGGER_NAMES = ['filelock', 'hpack', 'httpx', 'scrapy', 'twisted']
ALL_LOGGER_NAMES = APIFY_LOGGER_NAMES + SCRAPY_LOGGER_NAMES
MAIN_LOGGER_NAMES = ['apify', 'apify_client', 'scrapy']
OTHER_LOGGER_NAMES = ['filelock', 'hpack', 'httpcore', 'httpx', 'protego', 'twisted']
ALL_LOGGER_NAMES = MAIN_LOGGER_NAMES + OTHER_LOGGER_NAMES

# To change the logging level, modify the `LOG_LEVEL` field in `settings.py`. If the field is not present in the file,
# Scrapy will default to `DEBUG`. This setting applies to all loggers. If you wish to change the logging level for
@@ -36,8 +36,7 @@


def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamHandler) -> None:
"""
Configure a logger with the specified settings.
"""Configure a logger with the specified settings.
Args:
logger_name: The name of the logger to be configured.
@@ -54,7 +53,7 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH

# Apify loggers have to be set up here and in the `new_configure_logging` as well to be able to use them both from
# the `main.py` and Scrapy components.
for logger_name in APIFY_LOGGER_NAMES:
for logger_name in MAIN_LOGGER_NAMES:
configure_logger(logger_name, LOGGING_LEVEL, apify_handler)

# We can't attach our log handler to the loggers normally, because Scrapy would remove them in the `configure_logging`
@@ -66,7 +65,8 @@ def configure_logger(logger_name: str | None, log_level: str, *handlers: StreamH


def new_configure_logging(*args: Any, **kwargs: Any) -> None:
"""
"""Configure logging for Scrapy and root loggers to ensure consistent logging behavior.
We need to manually configure both the root logger and all Scrapy-associated loggers. Configuring only the root
logger is not sufficient, as Scrapy will override it with its own settings. Scrapy uses these four primary
loggers - https://github.com/scrapy/scrapy/blob/2.11.0/scrapy/utils/log.py#L60:L77. Therefore, we configure here
@@ -91,20 +91,24 @@ def new_configure_logging(*args: Any, **kwargs: Any) -> None:

scrapy_logging.configure_logging = new_configure_logging

# Now we can do the rest of the setup
# Now we can do the rest of the setup.
import asyncio
import os
import nest_asyncio
from scrapy.utils.reactor import install_reactor
from .main import main

# To ensure seamless compatibility between asynchronous libraries Twisted (used by Scrapy) and AsyncIO (used by Apify),
# it is highly recommended to use AsyncioSelectorReactor as the Twisted reactor
# For compatibility between Twisted (used by Scrapy) and AsyncIO (used by Apify) asynchronous libraries, it is
# necessary to set the Twisted reactor to `AsyncioSelectorReactor`. This setup allows the two asynchronous libraries
# to work together.
#
# Note: The reactor must be installed before applying `nest_asyncio.apply()`, otherwise, it will not work correctly
# on Windows.
install_reactor('twisted.internet.asyncioreactor.AsyncioSelectorReactor')
nest_asyncio.apply()

# Specify the path to the Scrapy project settings module
# Specify the path to the Scrapy project settings module.
os.environ['SCRAPY_SETTINGS_MODULE'] = '{{scrapy_settings_module}}'

# Run the Apify main coroutine
# Run the Apify main coroutine in the event loop.
asyncio.run(main())
56 changes: 9 additions & 47 deletions wrappers/python-scrapy/{projectFolder}/main.template.py
@@ -22,64 +22,26 @@
from __future__ import annotations

from scrapy.crawler import CrawlerProcess
from scrapy.settings import Settings
from scrapy.utils.project import get_project_settings

from apify import Actor
from apify.scrapy.utils import apply_apify_settings

# Import your Scrapy spider here
# Import your Scrapy spider here.
from {{spider_module_name}} import {{spider_class_name}} as Spider

# Default input values for local execution using `apify run`
# Default input values for local execution using `apify run`.
LOCAL_DEFAULT_START_URLS = [{'url': 'https://apify.com'}]


def _get_scrapy_settings(proxy_cfg: dict | None = None) -> Settings:
"""
Get Scrapy project settings with custom configurations.
You can add your own Scrapy components either in this function or in your `settings.py`.
Returns:
Scrapy project settings with custom configurations.
"""
settings = get_project_settings()

# Use ApifyScheduler as the scheduler
settings['SCHEDULER'] = 'apify.scrapy.scheduler.ApifyScheduler'

# Add the ActorDatasetPushPipeline into the item pipelines, assigning it the highest integer (1000),
# ensuring it is executed as the final step in the pipeline sequence
settings['ITEM_PIPELINES']['apify.scrapy.pipelines.ActorDatasetPushPipeline'] = 1000

# Disable the default RobotsTxtMiddleware, Apify's custom scheduler already handles robots.txt
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.robotstxt.RobotsTxtMiddleware'] = None

# Disable the default HttpProxyMiddleware and add ApifyHttpProxyMiddleware
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyHttpProxyMiddleware'] = 950

# Disable the default RetryMiddleware and add ApifyRetryMiddleware with the highest integer (1000)
settings['DOWNLOADER_MIDDLEWARES']['scrapy.downloadermiddlewares.retry.RetryMiddleware'] = None
settings['DOWNLOADER_MIDDLEWARES']['apify.scrapy.middlewares.ApifyRetryMiddleware'] = 1000

# Store the proxy configuration
settings['APIFY_PROXY_SETTINGS'] = proxy_cfg

return settings


async def main() -> None:
"""
Apify Actor main coroutine for executing the Scrapy spider.
"""
"""Apify Actor main coroutine for executing the Scrapy spider."""
async with Actor:
Actor.log.info('Actor is being executed...')

# Process Actor input
# Retrieve and process Actor input.
actor_input = await Actor.get_input() or {}
start_urls = actor_input.get('startUrls', LOCAL_DEFAULT_START_URLS)
proxy_configuration = actor_input.get('proxyConfiguration')
proxy_config = actor_input.get('proxyConfiguration')

# Open the default request queue for handling URLs to be processed.
request_queue = await Actor.open_request_queue()
@@ -89,10 +51,10 @@ async def main() -> None:
url = start_url.get('url')
await request_queue.add_request(url)

# Get Scrapy project settings with custom configurations
settings = _get_scrapy_settings(proxy_configuration)
# Apply Apify settings, it will override the Scrapy project settings.
settings = apply_apify_settings(proxy_config=proxy_config)

# Execute the spider using Scrapy CrawlerProcess
# Execute the spider using Scrapy `CrawlerProcess`.
process = CrawlerProcess(settings, install_root_handler=False)
process.crawl(Spider)
process.start()
