diff --git a/src/crawlee/_cli.py b/src/crawlee/_cli.py
index 2e2daf431..de82b8d3c 100644
--- a/src/crawlee/_cli.py
+++ b/src/crawlee/_cli.py
@@ -1,7 +1,7 @@
 # ruff: noqa: TRY301, FBT002, UP007
 from __future__ import annotations
 
-import os
+import json
 from pathlib import Path
 from typing import Annotated, Optional, cast
 
@@ -16,6 +16,11 @@
 
 cli = typer.Typer(no_args_is_help=True)
 
+cookiecutter_json = json.load((Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json').open())
+crawler_choices = cookiecutter_json['crawler_type']
+package_manager_choices = cookiecutter_json['package_manager']
+default_start_url = cookiecutter_json['start_url']
+
 
 @cli.callback(invoke_without_command=True)
 def callback(
@@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
     return project_name
 
 
-def _prompt_for_template() -> str:
-    """Prompt the user to select a template from a list."""
-    # Fetch available templates
-    response = httpx.get(
-        TEMPLATE_LIST_URL,
-        timeout=httpx.Timeout(10),
-        headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
+def _prompt_text(message: str, default: str) -> str:
+    """Prompt the user for a free-text value, rejecting blank input."""
+    return cast(
+        str,
+        ConsoleRender().render(
+            inquirer.Text(
+                name='text',
+                message=message,
+                default=default,
+                validate=lambda _, value: bool(value.strip()),
+            ),
+        ),
     )
-    response.raise_for_status()
-    template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']
 
-    # Prompt for template choice
+
+def _prompt_choice(message: str, choices: list[str]) -> str:
+    """Prompt the user to pick one from a list of choices."""
     return cast(
        str,
         ConsoleRender().render(
             inquirer.List(
-                name='template',
-                message='Please select the template for your new Crawlee project',
-                choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
+                name='choice',
+                message=message,
+                choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
+            ),
+        ),
+    )
+
+
+def _prompt_bool(message: str, *, default: bool) -> bool:
+    """Prompt the user for a yes/no confirmation."""
+    return cast(
+        bool,
+        ConsoleRender().render(
+            inquirer.Confirm(
+                name='confirm',
+                message=message,
+                default=default,
             ),
         ),
     )
@@ -92,14 +114,32 @@
 def create(
     project_name: Optional[str] = typer.Argument(
         default=None,
+        show_default=False,
         help='The name of the project and the directory that will be created to contain it. '
         'If none is given, you will be prompted.',
+    ),
+    crawler_type: Optional[str] = typer.Option(
+        None,
+        '--crawler-type',
+        '--template',
         show_default=False,
+        help='The library that will be used for crawling. If none is given, you will be prompted.',
     ),
-    template: Optional[str] = typer.Option(
+    package_manager: Optional[str] = typer.Option(
         default=None,
-        help='The template to be used to create the project. If none is given, you will be prompted.',
         show_default=False,
+        help='The package manager to be used in the new project. If none is given, you will be prompted.',
+    ),
+    start_url: Optional[str] = typer.Option(
+        default=None,
+        show_default=False,
+        help='The URL where crawling should start. If none is given, you will be prompted.',
+    ),
+    enable_apify_integration: Optional[bool] = typer.Option(
+        None,
+        '--apify/--no-apify',
+        show_default=False,
+        help='Should Apify integration be set up for you? If not given, you will be prompted.',
     ),
 ) -> None:
     """Bootstrap a new Crawlee project."""
@@ -107,11 +147,33 @@ def create(
     # Prompt for project name if not provided.
     project_name = _prompt_for_project_name(project_name)
 
-    # Prompt for template choice if not provided.
-    if template is None:
-        template = _prompt_for_template()
+    # Prompt for the crawler type if not provided.
+    if crawler_type is None:
+        crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)
+
+    # Prompt for the package manager if not provided.
+    if package_manager is None:
+        package_manager = _prompt_choice('Please select the package manager', package_manager_choices)
+
+    # Prompt for the start URL if not provided.
+    if start_url is None:
+        start_url = _prompt_text('Please specify the start URL', default=default_start_url)
+
+    # Ask about Apify integration if it was not configured explicitly.
+    if enable_apify_integration is None:
+        enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)
+
+    if all(
+        [
+            project_name,
+            crawler_type,
+            package_manager,
+            start_url,
+            enable_apify_integration is not None,
+        ]
+    ):
+        package_name = project_name.replace('-', '_')
 
-    if project_name and template:
         # Start the bootstrap process.
         with Progress(
             SpinnerColumn(),
             TextColumn('[progress.description]{task.description}'),
             transient=True,
         ) as progress:
             progress.add_task(description='Bootstrapping...', total=None)
             cookiecutter(
                 template='gh:apify/crawlee-python',
-                directory=f'templates/{template}',
+                directory='templates/crawler',
                 no_input=True,
-                extra_context={'project_name': project_name},
+                extra_context={
+                    'project_name': project_name,
+                    'package_manager': package_manager,
+                    'crawler_type': crawler_type,
+                    'enable_apify_integration': enable_apify_integration,
+                    'start_url': start_url,
+                },
             )
 
         typer.echo(f'Your project "{project_name}" was created.')
         typer.echo(
             f'To run it, navigate to the directory: "cd {project_name}", '
-            'install dependencies with "poetry install", '
-            f'and run it using "poetry run python -m {project_name}".'
+            + (
+                f'and run it using "poetry run python -m {package_name}".'
+                if package_manager == 'poetry'
+                else f'and run it using "python -m {package_name}".'
+            )
) typer.echo(f'See the "{project_name}/README.md" for more information.') diff --git a/templates/crawler/cookiecutter.json b/templates/crawler/cookiecutter.json new file mode 100644 index 000000000..33f54fff7 --- /dev/null +++ b/templates/crawler/cookiecutter.json @@ -0,0 +1,12 @@ +{ + "project_name": "crawlee-python-beautifulsoup-project", + "__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}", + "crawler_type": ["beautifulsoup", "parsel", "playwright"], + "package_manager": ["poetry", "pip"], + "enable_apify_integration": false, + "start_url": "https://crawlee.dev", + "_jinja2_env_vars": { + "line_statement_prefix": "# %" + }, + "_extensions": ["jinja2.ext.do"] +} diff --git a/templates/crawler/hooks/post_gen_project.py b/templates/crawler/hooks/post_gen_project.py new file mode 100644 index 000000000..02a5c18d8 --- /dev/null +++ b/templates/crawler/hooks/post_gen_project.py @@ -0,0 +1,13 @@ +import subprocess + +# % if cookiecutter.package_manager == 'poetry' +subprocess.check_call(['poetry', 'install']) +# % if cookiecutter.crawler_type == 'playwright' +subprocess.check_call(['poetry', 'run', 'playwright', 'install']) +# % endif +# % elif cookiecutter.package_manager == 'pip' +subprocess.check_call(['pip', 'install', '.']) +# % if cookiecutter.crawler_type == 'playwright' +subprocess.check_call(['playwright', 'install']) +# % endif +# % endif diff --git a/templates/crawler/templates/main.py b/templates/crawler/templates/main.py new file mode 100644 index 000000000..a8d4ac36b --- /dev/null +++ b/templates/crawler/templates/main.py @@ -0,0 +1,32 @@ +# % if cookiecutter.enable_apify_integration +from apify import Actor +# % endif +# % block import required +# % endblock + +from .routes import router + + +async def main() -> None: + """The crawler entry point.""" + # % filter truncate(0, end='') + # % block instantiation required + # % endblock + # % endfilter + + # % if cookiecutter.enable_apify_integration + async with Actor: + # % filter indent(width=8, first=False) + {{ self.instantiation() }} + # % endfilter + # % else + # % filter indent(width=4, first=False) + {{ self.instantiation() }} + # % endfilter + # % endif + + await crawler.run( + [ + '{{ cookiecutter.start_url }}', + ] + ) diff --git a/templates/crawler/templates/main_beautifulsoup.py b/templates/crawler/templates/main_beautifulsoup.py new file mode 100644 index 000000000..d4a328091 --- /dev/null +++ b/templates/crawler/templates/main_beautifulsoup.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler +# % endblock + +# % block instantiation +crawler = BeautifulSoupCrawler( + request_handler=router, + max_requests_per_crawl=50, +) +# % endblock diff --git a/templates/crawler/templates/main_parsel.py b/templates/crawler/templates/main_parsel.py new file mode 100644 index 000000000..d503636f5 --- /dev/null +++ b/templates/crawler/templates/main_parsel.py @@ -0,0 +1,12 @@ +# % extends 'main.py' + +# % block import +from crawlee.parsel_crawler import ParselCrawler +# % endblock + +# % block instantiation +crawler = ParselCrawler( + request_handler=router, + max_requests_per_crawl=50, +) +# % endblock diff --git a/templates/crawler/templates/main_playwright.py b/templates/crawler/templates/main_playwright.py new file mode 100644 index 000000000..093512f8a --- /dev/null +++ b/templates/crawler/templates/main_playwright.py @@ -0,0 +1,13 @@ +# % extends 'main.py' + +# % block import +from crawlee.playwright_crawler import 
PlaywrightCrawler +# % endblock + +# % block instantiation +crawler = PlaywrightCrawler( + request_handler=router, + headless=True, + max_requests_per_crawl=50, +) +# % endblock diff --git a/templates/crawler/templates/routes_beautifulsoup.py b/templates/crawler/templates/routes_beautifulsoup.py new file mode 100644 index 000000000..4b8715a35 --- /dev/null +++ b/templates/crawler/templates/routes_beautifulsoup.py @@ -0,0 +1,19 @@ +from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext +from crawlee.router import Router + +router = Router[BeautifulSoupCrawlingContext]() + + +@router.default_handler +async def default_handler(context: BeautifulSoupCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.soup.find('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title.text if title else None, + } + ) + + await context.enqueue_links() diff --git a/templates/crawler/templates/routes_parsel.py b/templates/crawler/templates/routes_parsel.py new file mode 100644 index 000000000..b5c3e8118 --- /dev/null +++ b/templates/crawler/templates/routes_parsel.py @@ -0,0 +1,19 @@ +from crawlee.parsel_crawler import ParselCrawlingContext +from crawlee.router import Router + +router = Router[ParselCrawlingContext]() + + +@router.default_handler +async def default_handler(context: ParselCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = context.selector.xpath('//title/text()').get() + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': title, + } + ) + + await context.enqueue_links() diff --git a/templates/crawler/templates/routes_playwright.py b/templates/crawler/templates/routes_playwright.py new file mode 100644 index 000000000..47aa207cc --- /dev/null +++ b/templates/crawler/templates/routes_playwright.py @@ -0,0 +1,19 @@ +from crawlee.playwright_crawler import PlaywrightCrawlingContext +from crawlee.router import Router + +router = Router[PlaywrightCrawlingContext]() + + +@router.default_handler +async def default_handler(context: PlaywrightCrawlingContext) -> None: + """Default request handler.""" + context.log.info(f'Processing {context.request.url} ...') + title = await context.page.query_selector('title') + await context.push_data( + { + 'url': context.request.loaded_url, + 'title': await title.inner_text() if title else None, + } + ) + + await context.enqueue_links() diff --git a/templates/crawler/{{cookiecutter.project_name}}/Dockerfile b/templates/crawler/{{cookiecutter.project_name}}/Dockerfile new file mode 100644 index 000000000..dabdc3369 --- /dev/null +++ b/templates/crawler/{{cookiecutter.project_name}}/Dockerfile @@ -0,0 +1,62 @@ +# First, specify the base Docker image. +# You can see the Docker images from Apify at https://hub.docker.com/r/apify/. +# You can also use any other image from Docker Hub. 
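+# The Playwright image variant ships with Playwright and its browsers preinstalled;
+# the dependency install below pins the playwright package to that preinstalled version.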
+# % if cookiecutter.crawler_type == 'playwright'
+FROM apify/actor-python-playwright:3.12
+# % else
+FROM apify/actor-python:3.12
+# % endif
+
+RUN apt-get update && apt-get install -yq git && rm -rf /var/lib/apt/lists/*
+
+# % if cookiecutter.package_manager == 'poetry'
+RUN pip install -U pip setuptools \
+ && pip install poetry \
+ && poetry self add poetry-plugin-export
+
+# Second, copy just poetry.lock and pyproject.toml into the Actor image,
+# since those should be the only files that affect the dependency install in the next step,
+# in order to speed up the build
+COPY pyproject.toml ./
+COPY poetry.lock ./
+
+# Install the dependencies
+RUN echo "Python version:" \
+ && python --version \
+ && echo "Installing dependencies:" \
+ # Export packages from poetry.lock
+ && poetry export -f requirements.txt --without-hashes | \
+ # Replace playwright version so that it matches whatever is pre-installed in the image
+ sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
+ # Install everything using pip (ignore dependency checks - the lockfile is correct, period)
+ pip install -r /dev/stdin --no-dependencies \
+ && echo "All installed Python packages:" \
+ && pip freeze
+# % elif cookiecutter.package_manager == 'pip'
+RUN pip install -U pip setuptools
+
+# Second, copy just pyproject.toml into the Actor image,
+# since it should be the only file that affects the dependency install in the next step,
+# in order to speed up the build
+COPY pyproject.toml ./
+
+# Install the dependencies
+RUN echo "Python version:" \
+ && python --version \
+ && echo "Installing dependencies:" \
+ # Install everything using pip, and set the playwright version so that it matches whatever is pre-installed in the image
+ && pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
+ && echo "All installed Python packages:" \
+ && pip freeze
+# % endif
+
+# Next, copy the remaining files and directories with the source code.
+# Since we do this after installing the dependencies, quick build will be really fast
+# for most source file changes.
+COPY . ./
+
+# Use compileall to ensure the runnability of the Actor Python code.
+RUN python -m compileall -q .
+
+# Specify how to launch the source code of your Actor.
+CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]
diff --git a/templates/crawler/{{cookiecutter.project_name}}/README.md b/templates/crawler/{{cookiecutter.project_name}}/README.md
new file mode 100644
index 000000000..444d8c44a
--- /dev/null
+++ b/templates/crawler/{{cookiecutter.project_name}}/README.md
@@ -0,0 +1,37 @@
+# {{cookiecutter.project_name}}
+
+Project skeleton generated by Crawlee ({{cookiecutter.crawler_type}} template).
+
+## Usage
+
+{% if cookiecutter.package_manager == 'poetry' %}
+To get started, ensure you have [Poetry](https://python-poetry.org/), a package and dependency management system, installed on your machine. We recommend installing it with the following command:
+
+```sh
+pipx install poetry
+```
+
+Next, install the project dependencies:
+
+```sh
+poetry install
+```
+
+Finally, launch the crawler with:
+
+```sh
+poetry run python -m {{cookiecutter.__package_name}}
+```
+{% elif cookiecutter.package_manager == 'pip' %}
+To install the dependencies, you can run the following command:
+
+```sh
+pip install .
+```
+
+Once the dependencies are installed, you can launch the crawler with:
+
+```sh
+python -m {{cookiecutter.__package_name}}
+```
+{% endif %}
diff --git a/templates/crawler/{{cookiecutter.project_name}}/pyproject.toml b/templates/crawler/{{cookiecutter.project_name}}/pyproject.toml
new file mode 100644
index 000000000..f73a8565c
--- /dev/null
+++ b/templates/crawler/{{cookiecutter.project_name}}/pyproject.toml
@@ -0,0 +1,34 @@
+# % set extras = [cookiecutter.crawler_type]
+# % if cookiecutter.enable_apify_integration
+# % do extras.append('apify')
+# % endif
+
+# % if cookiecutter.package_manager == 'poetry'
+[tool.poetry]
+name = "{{cookiecutter.project_name}}"
+version = "0.0.1"
+description = ""
+authors = ["Your Name <you@example.com>"]
+readme = "README.md"
+package-mode = false
+
+[tool.poetry.dependencies]
+python = "^3.9"
+crawlee = {version = "*", extras = {{ extras | tojson }}}
+
+[build-system]
+requires = ["poetry-core"]
+build-backend = "poetry.core.masonry.api"
+# % else
+[project]
+name = "{{cookiecutter.project_name}}"
+version = "0.0.1"
+description = ""
+authors = [{name = "Your Name", email = "you@example.com"}]
+readme = "README.md"
+requires-python = ">=3.9"
+
+dependencies = [
+    "crawlee[{{extras | join(',')}}]"
+]
+# % endif
diff --git a/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__init__.py
new file mode 100644
index 000000000..e69de29bb
diff --git a/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
new file mode 100644
index 000000000..8c4ab0b86
--- /dev/null
+++ b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/__main__.py
@@ -0,0 +1,6 @@
+import asyncio
+
+from .main import main
+
+if __name__ == '__main__':
+    asyncio.run(main())
diff --git a/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
new file mode 100644
index 000000000..d6591ab12
--- /dev/null
+++ b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/main.py
@@ -0,0 +1 @@
+# % include 'main_%s.py' % cookiecutter.crawler_type
diff --git a/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py
new file mode 100644
index 000000000..dfab2f1bb
--- /dev/null
+++ b/templates/crawler/{{cookiecutter.project_name}}/{{cookiecutter.__package_name}}/routes.py
@@ -0,0 +1 @@
+# % include 'routes_%s.py' % cookiecutter.crawler_type
diff --git a/tests/unit/test_cli.py b/tests/unit/test_cli.py
index 787feb28a..32e52f2c7 100644
--- a/tests/unit/test_cli.py
+++ b/tests/unit/test_cli.py
@@ -26,6 +26,9 @@ def test_create_interactive(mock_cookiecutter: Mock, monkeypatch: pytest.MonkeyP
             *'my_project',
             readchar.key.ENTER,
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -35,9 +38,15 @@
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/beautifulsoup',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'beautifulsoup',
+            'enable_apify_integration': False,
+            'start_url': 'https://crawlee.dev',
+        },
     )
@@ -48,6 +57,9 @@ def test_create_interactive_non_default_template(mock_cookiecutter: Mock, monkey
             readchar.key.ENTER,
             readchar.key.DOWN,
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -57,20 +69,45 @@
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'parsel',
+            'enable_apify_integration': False,
+            'start_url': 'https://crawlee.dev',
+        },
     )
 
 
 def test_create_non_interactive(mock_cookiecutter: Mock) -> None:
-    runner.invoke(crawlee._cli.cli, ['create', 'my_project', '--template', 'playwright'])
+    runner.invoke(
+        crawlee._cli.cli,
+        [
+            'create',
+            'my_project',
+            '--crawler-type',
+            'playwright',
+            '--package-manager',
+            'pip',
+            '--start-url',
+            'https://yr.no',
+            '--no-apify',
+        ],
+    )
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'pip',
+            'crawler_type': 'playwright',
+            'start_url': 'https://yr.no',
+            'enable_apify_integration': False,
+        },
     )
@@ -89,14 +126,33 @@ def test_create_existing_folder(
     os.chdir(tmp)
     (tmp / 'existing_project').mkdir()
 
-    result = runner.invoke(crawlee._cli.cli, ['create', 'existing_project', '--template', 'playwright'])
+    result = runner.invoke(
+        crawlee._cli.cli,
+        [
+            'create',
+            'existing_project',
+            '--crawler-type',
+            'playwright',
+            '--package-manager',
+            'pip',
+            '--start-url',
+            'https://yr.no',
+            '--no-apify',
+        ],
+    )
     assert 'existing_project already exists' in result.output
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'pip',
+            'crawler_type': 'playwright',
+            'start_url': 'https://yr.no',
+            'enable_apify_integration': False,
+        },
     )
@@ -109,6 +165,9 @@ def test_create_existing_folder_interactive(
             readchar.key.ENTER,
             *'my_project',
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -122,9 +181,15 @@
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'playwright',
+            'start_url': 'https://crawlee.dev',
+            'enable_apify_integration': False,
+        },
     )
@@ -139,6 +204,9 @@ def test_create_existing_folder_interactive_multiple_attempts(
             readchar.key.ENTER,
             *'my_project',
             readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
+            readchar.key.ENTER,
         ]
     )
     monkeypatch.setattr(target=readchar, name='readkey', value=lambda: next(mock_input))
@@ -148,12 +216,18 @@
     (tmp / 'existing_project').mkdir()
     (tmp / 'existing_project_2').mkdir()
 
-    result = runner.invoke(crawlee._cli.cli, ['create', '--template', 'playwright'])
+    result = runner.invoke(crawlee._cli.cli, ['create', '--crawler-type', 'playwright'])
     assert 'existing_project already exists' in result.output
 
     mock_cookiecutter.assert_called_with(
         template='gh:apify/crawlee-python',
-        directory='templates/playwright',
+        directory='templates/crawler',
         no_input=True,
-        extra_context={'project_name': 'my_project'},
+        extra_context={
+            'project_name': 'my_project',
+            'package_manager': 'poetry',
+            'crawler_type': 'playwright',
+            'start_url': 'https://crawlee.dev',
+            'enable_apify_integration': False,
+        },
     )
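For reference, the non-interactive flow exercised by the tests above reduces to a single command. A minimal sketch, assuming the package exposes its Typer app as the `crawlee` console script; any option left out falls back to an interactive prompt:

```sh
# Bootstrap a Playwright project with no prompts; flag names come from the new CLI above.
# Assumes the 'crawlee' console script is on PATH (installed with the crawlee package).
crawlee create my-project \
  --crawler-type playwright \
  --package-manager pip \
  --start-url https://crawlee.dev \
  --no-apify
```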