feat: Improve project bootstrapping #538

Open · wants to merge 3 commits into master (changes shown from 1 commit)
110 changes: 87 additions & 23 deletions src/crawlee/_cli.py
@@ -1,7 +1,7 @@
# ruff: noqa: TRY301, FBT002, UP007
from __future__ import annotations

import os
import json
from pathlib import Path
from typing import Annotated, Optional, cast

@@ -16,6 +16,11 @@

cli = typer.Typer(no_args_is_help=True)

# Load the prompt choices and defaults from the shared cookiecutter template config.
cookiecutter_json = json.load((Path(__file__).parent.parent.parent / 'templates' / 'crawler' / 'cookiecutter.json').open())
crawler_choices = cookiecutter_json['crawler_type']
package_manager_choices = cookiecutter_json['package_manager']
default_start_url = cookiecutter_json['start_url']


@cli.callback(invoke_without_command=True)
def callback(
@@ -64,25 +69,42 @@ def _prompt_for_project_name(initial_project_name: str | None) -> str:
return project_name


def _prompt_for_template() -> str:
"""Prompt the user to select a template from a list."""
# Fetch available templates
response = httpx.get(
TEMPLATE_LIST_URL,
timeout=httpx.Timeout(10),
headers=[('Authorization', f'Bearer {os.environ["GH_TOKEN"]}')] if 'GH_TOKEN' in os.environ else [],
def _prompt_text(message: str, default: str) -> str:
return cast(
str,
ConsoleRender().render(
inquirer.Text(
name='text',
message=message,
default=default,
validate=lambda _, value: bool(value.strip()),
),
),
)
response.raise_for_status()
template_choices = [item['name'] for item in response.json() if item['type'] == 'dir']

# Prompt for template choice

def _prompt_choice(message: str, choices: list[str]) -> str:
"""Prompt the user to pick one from a list of choices."""
return cast(
str,
ConsoleRender().render(
inquirer.List(
name='template',
message='Please select the template for your new Crawlee project',
choices=[(choice[0].upper() + choice[1:], choice) for choice in template_choices],
name='choice',
message=message,
choices=[(choice[0].upper() + choice[1:], choice) for choice in choices],
),
),
)


def _prompt_bool(message: str, *, default: bool) -> bool:
return cast(
bool,
ConsoleRender().render(
inquirer.Confirm(
name='confirm',
message=message,
default=default,
),
),
)
@@ -92,26 +114,63 @@ def _prompt_for_template() -> str:
def create(
project_name: Optional[str] = typer.Argument(
default=None,
show_default=False,
help='The name of the project and the directory that will be created to contain it. '
'If none is given, you will be prompted.',
),
crawler_type: Optional[str] = typer.Option(
None,
'--crawler-type',
'--template',
show_default=False,
help='The library that will be used for crawling in your project. If none is given, you will be prompted.',
),
package_manager: Optional[str] = typer.Option(
default=None,
show_default=False,
help='Package manager to be used in the new project. If none is given, you will be prompted.',
),
start_url: Optional[str] = typer.Option(
default=None,
show_default=False,
help='The URL where crawling should start. If none is given, you will be prompted.',
),
template: Optional[str] = typer.Option(
enable_apify_integration: Optional[bool] = typer.Option(
default=None,
help='The template to be used to create the project. If none is given, you will be prompted.',
show_default=False,
help='Should Apify integration be set up for you? If not given, you will be prompted.',
),
) -> None:
"""Bootstrap a new Crawlee project."""
try:
# Prompt for project name if not provided.
project_name = _prompt_for_project_name(project_name)

# Prompt for template choice if not provided.
if template is None:
template = _prompt_for_template()

if project_name and template:
# Prompt for crawler_type if not provided.
if crawler_type is None:
crawler_type = _prompt_choice('Please select the Crawler type', crawler_choices)

# Prompt for package manager if not provided.
if package_manager is None:
package_manager = _prompt_choice('Please select the package manager', package_manager_choices)

# Prompt for start URL
if start_url is None:
start_url = _prompt_text('Please specify the start URL', default=default_start_url)

# Ask about Apify integration if not explicitly configured
if enable_apify_integration is None:
enable_apify_integration = _prompt_bool('Should Apify integration be set up for you?', default=False)

if all(
[
project_name,
crawler_type,
package_manager,
start_url,
enable_apify_integration is not None,
]
):
# Start the bootstrap process.
with Progress(
SpinnerColumn(),
@@ -121,9 +180,14 @@ def create(
progress.add_task(description='Bootstrapping...', total=None)
cookiecutter(
template='gh:apify/crawlee-python',
directory=f'templates/{template}',
directory='templates/crawler',
no_input=True,
extra_context={'project_name': project_name},
extra_context={
'project_name': project_name,
'package_manager': package_manager,
'crawler_type': crawler_type,
'enable_apify_integration': enable_apify_integration,
},
)

typer.echo(f'Your project "{project_name}" was created.')
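With every prompt mirrored by a flag, the whole bootstrap can run non-interactively. A hypothetical invocation (assuming the package's crawlee console script exposes this Typer app; the --enable-apify-integration/--no-enable-apify-integration pair is what Typer generates for an Optional[bool] option):

    crawlee create my-crawler \
        --crawler-type playwright \
        --package-manager poetry \
        --start-url https://crawlee.dev \
        --no-enable-apify-integration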
12 changes: 12 additions & 0 deletions templates/crawler/cookiecutter.json
@@ -0,0 +1,12 @@
{
"project_name": "crawlee-python-beautifulsoup-project",
Collaborator (on the line above): Is it correct?

Collaborator (Author): Good point. I'll probably just change it to my-crawler or something.

"__package_name": "{{ cookiecutter.project_name|lower|replace('-', '_') }}",
"crawler_type": ["beautifulsoup", "parsel", "playwright"],
"package_manager": ["poetry", "pip"],
"enable_apify_integration": false,
"start_url": "https://crawlee.dev",
"_jinja2_env_vars": {
"line_statement_prefix": "# %"
},
"_extensions": ["jinja2.ext.do"]
}
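This is the file that the new module-level code in src/crawlee/_cli.py parses. Cookiecutter treats a JSON list as a multiple-choice variable whose first element is the default, so the CLI prompts and the template defaults now live in one place. A sketch of what the loaded values resolve to, given the JSON above:

    crawler_choices == ['beautifulsoup', 'parsel', 'playwright']
    package_manager_choices == ['poetry', 'pip']
    default_start_url == 'https://crawlee.dev'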
32 changes: 32 additions & 0 deletions templates/crawler/templates/main.py
@@ -0,0 +1,32 @@
# % if cookiecutter.enable_apify_integration
from apify import Actor
# % endif
# % block import required
# % endblock

from .routes import router


async def main() -> None:
"""The crawler entry point."""
# % filter truncate(0, end='')
# % block instantiation required
# % endblock
# % endfilter

# % if cookiecutter.enable_apify_integration
async with Actor:
# % filter indent(width=8, first=False)
{{ self.instantiation() }}
# % endfilter
# % else
# % filter indent(width=4, first=False)
{{ self.instantiation() }}
# % endfilter
# % endif

await crawler.run(
[
'{{ cookiecutter.start_url }}',
]
)
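To make the block mechanics concrete: the line_statement_prefix set in cookiecutter.json turns the # % lines into Jinja statements, and the truncate(0, end='') filter swallows the required instantiation block's output so that self.instantiation() can re-emit it at the right indentation. Assuming crawler_type is beautifulsoup and enable_apify_integration is false (and indentation that this diff view strips), the rendered main.py should come out roughly as:

    from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler

    from .routes import router


    async def main() -> None:
        """The crawler entry point."""
        crawler = BeautifulSoupCrawler(
            request_handler=router,
            max_requests_per_crawl=50,
        )

        await crawler.run(
            [
                'https://crawlee.dev',
            ]
        )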
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_beautifulsoup.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawler
# % endblock

# % block instantiation
crawler = BeautifulSoupCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
12 changes: 12 additions & 0 deletions templates/crawler/templates/main_parsel.py
@@ -0,0 +1,12 @@
# % extends 'main.py'

# % block import
from crawlee.parsel_crawler import ParselCrawler
# % endblock

# % block instantiation
crawler = ParselCrawler(
request_handler=router,
max_requests_per_crawl=50,
)
# % endblock
13 changes: 13 additions & 0 deletions templates/crawler/templates/main_playwright.py
@@ -0,0 +1,13 @@
# % extends 'main.py'

# % block import
from crawlee.playwright_crawler import PlaywrightCrawler
# % endblock

# % block instantiation
crawler = PlaywrightCrawler(
request_handler=router,
headless=True,
max_requests_per_crawl=50,
)
# % endblock
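With enable_apify_integration enabled, the indent(width=8, first=False) branch nests the same instantiation block under the Actor context manager. Reading the main.py template literally, the playwright variant would render roughly as follows; note that crawler.run sits after the # % endif, so it renders outside the async with block:

    from apify import Actor
    from crawlee.playwright_crawler import PlaywrightCrawler

    from .routes import router


    async def main() -> None:
        """The crawler entry point."""
        async with Actor:
            crawler = PlaywrightCrawler(
                request_handler=router,
                headless=True,
                max_requests_per_crawl=50,
            )

        await crawler.run(
            [
                'https://crawlee.dev',
            ]
        )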
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_beautifulsoup.py
@@ -0,0 +1,19 @@
from crawlee.beautifulsoup_crawler import BeautifulSoupCrawlingContext
from crawlee.router import Router

router = Router[BeautifulSoupCrawlingContext]()


@router.default_handler
async def default_handler(context: BeautifulSoupCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.soup.find('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': title.text if title else None,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_parsel.py
@@ -0,0 +1,19 @@
from crawlee.parsel_crawler import ParselCrawlingContext
from crawlee.router import Router

router = Router[ParselCrawlingContext]()


@router.default_handler
async def default_handler(context: ParselCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = context.selector.xpath('//title/text()').get()
await context.push_data(
{
'url': context.request.loaded_url,
'title': title,
}
)

await context.enqueue_links()
19 changes: 19 additions & 0 deletions templates/crawler/templates/routes_playwright.py
@@ -0,0 +1,19 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext
from crawlee.router import Router

router = Router[PlaywrightCrawlingContext]()


@router.default_handler
async def default_handler(context: PlaywrightCrawlingContext) -> None:
"""Default request handler."""
context.log.info(f'Processing {context.request.url} ...')
title = await context.page.query_selector('title')
await context.push_data(
{
'url': context.request.loaded_url,
'title': await title.inner_text() if title else None,
}
)

await context.enqueue_links()
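Each generated routes module registers a single default handler on a shared Router. Labeled handlers can hang off the same instance; a hypothetical extension (the DETAIL label and handler name are invented for illustration, assuming Router.handler registers labeled handlers as in Crawlee's routing API):

    @router.handler('DETAIL')
    async def detail_handler(context: PlaywrightCrawlingContext) -> None:
        """Handle requests enqueued with the DETAIL label."""
        context.log.info(f'Scraping detail page {context.request.url} ...')
        await context.push_data({'url': context.request.loaded_url})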
62 changes: 62 additions & 0 deletions templates/crawler/{{cookiecutter.project_name}}/Dockerfile
Collaborator (on this file): I know this is copied, but I suggest using Hadolint and improving it according to their suggestions.

We should also pin (or at least limit to < 2) the Poetry version (mostly because of python-poetry/poetry#3332).

@@ -0,0 +1,62 @@
# First, specify the base Docker image.
# You can see the Docker images from Apify at https://hub.docker.com/r/apify/.
# You can also use any other image from Docker Hub.
# % if cookiecutter.crawler_type == 'playwright'
FROM apify/actor-python-playwright:3.12
# % else
FROM apify/actor-python:3.12
# % endif

RUN apt install -yq git && rm -rf /var/lib/apt/lists/*

# % if cookiecutter.package_manager == 'poetry'
RUN pip install -U pip setuptools \
&& pip install poetry \
&& poetry self add poetry-plugin-export

# Second, copy just poetry.lock and pyproject.toml into the Actor image,
# since those should be the only files that affect the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./
COPY poetry.lock ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Export packages from poetry.lock
&& poetry export -f requirements.txt --without-hashes | \
# Replace playwright version so that it matches whatever is pre-installed in the image
sed "s/^playwright==.*/playwright==$(playwright --version | cut -d ' ' -f 2)/" | \
# Install everything using pip (ignore dependency checks - the lockfile is correct, period)
pip install -r /dev/stdin --no-dependencies \
&& echo "All installed Python packages:" \
&& pip freeze
# % elif cookiecutter.package_manager == 'pip'
RUN pip install -U pip setuptools

# Second, copy just pyproject.toml into the Actor image,
# since it should be the only file that affects the dependency install in the next step,
# in order to speed up the build
COPY pyproject.toml ./

# Install the dependencies
RUN echo "Python version:" \
&& python --version \
&& echo "Installing dependencies:" \
# Install everything using pip, set playwright version so that it matches whatever is pre-installed in the image
&& pip install . playwright==$(playwright --version | cut -d ' ' -f 2) \
&& echo "All installed Python packages:" \
&& pip freeze
# % endif

# Next, copy the remaining files and directories with the source code.
# Since we do this after installing the dependencies, quick build will be really fast
# for most source file changes.
COPY . ./

# Use compileall to ensure the runnability of the Actor Python code.
RUN python -m compileall -q .

# Specify how to launch the source code of your Actor.
CMD ["python", "-m", "{{ cookiecutter.__package_name }}"]