Skip to content

Commit b40b91e

Browse files
authored
chore: Set UA to desktop / linux / chrome and add the ability to set custom headers in scraper (#121)
* chore: Set UA to desktop / linux / chrome and add the ability to set custom headers in scraper
* chore: Add ua-generator to scraper
* chore: Use custom headers to set user agent API
* chore: add ua-generator to API and CLI
* chore: remove unused import
1 parent 63e063a commit b40b91e

File tree

11 files changed

+94
-38
lines changed

11 files changed

+94
-38
lines changed

bases/ecoindex/backend/routers/tasks.py

Lines changed: 21 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33

44
import requests
55
from celery.result import AsyncResult
6-
import ua_generator
76
from ecoindex.backend.dependencies.validation import validate_api_key_batch
87
from ecoindex.backend.models.dependencies_parameters.id import IdParameter
98
from ecoindex.backend.utils import check_quota
@@ -17,6 +16,7 @@
1716
example_host_unreachable,
1817
)
1918
from ecoindex.models.tasks import QueueTaskApi, QueueTaskApiBatch, QueueTaskResult
19+
from ecoindex.scraper.scrap import EcoindexScraper
2020
from ecoindex.worker.tasks import ecoindex_batch_import_task, ecoindex_task
2121
from ecoindex.worker_component import app as task_app
2222
from fastapi import APIRouter, Depends, HTTPException, Response, status
@@ -49,6 +49,13 @@ async def add_ecoindex_analysis_task(
4949
example=WebPage(url="https://www.ecoindex.fr", width=1920, height=1080),
5050
),
5151
],
52+
custom_headers: Annotated[
53+
dict[str, str],
54+
Body(
55+
description="Custom headers to add to the request",
56+
example={"X-My-Custom-Header": "MyValue"},
57+
),
58+
] = {},
5259
session: AsyncSession = Depends(get_session),
5360
) -> str:
5461
if Settings().DAILY_LIMIT_PER_HOST:
@@ -68,20 +75,29 @@ async def add_ecoindex_analysis_task(
6875
detail="This host is excluded from the analysis",
6976
)
7077

78+
ua = EcoindexScraper.get_user_agent()
79+
headers = {**custom_headers, **ua.headers.get()}
80+
7181
try:
72-
ua = ua_generator.generate()
73-
r = requests.head(url=web_page.url, timeout=5, headers=ua.headers.get())
82+
r = requests.head(
83+
url=web_page.url,
84+
timeout=5,
85+
headers=headers,
86+
)
7487
r.raise_for_status()
7588
except requests.exceptions.RequestException as e:
7689
raise HTTPException(
7790
status_code=e.response.status_code
7891
if e.response
7992
else status.HTTP_400_BAD_REQUEST,
80-
detail=f"The URL {web_page.url} is unreachable. Are you really sure of this url? 🤔",
93+
detail=f"The URL {web_page.url} is unreachable. Are you really sure of this url? 🤔 ({e.response.status_code if e.response else ''})",
8194
)
8295

8396
task_result = ecoindex_task.delay( # type: ignore
84-
url=str(web_page.url), width=web_page.width, height=web_page.height
97+
url=str(web_page.url),
98+
width=web_page.width,
99+
height=web_page.height,
100+
custom_headers=headers,
85101
)
86102

87103
return task_result.id

bases/ecoindex/worker/tasks.py

Lines changed: 8 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -37,9 +37,13 @@
3737
queue="ecoindex",
3838
dont_autoretry_for=[EcoindexScraperStatusException, TypeError],
3939
)
40-
def ecoindex_task(self, url: str, width: int, height: int) -> str:
40+
def ecoindex_task(
41+
self, url: str, width: int, height: int, custom_headers: dict[str, str]
42+
) -> str:
4143
queue_task_result = run(
42-
async_ecoindex_task(self, url=url, width=width, height=height)
44+
async_ecoindex_task(
45+
self, url=url, width=width, height=height, custom_headers=custom_headers
46+
)
4347
)
4448

4549
return queue_task_result.model_dump_json()
@@ -50,6 +54,7 @@ async def async_ecoindex_task(
5054
url: str,
5155
width: int,
5256
height: int,
57+
custom_headers: dict[str, str],
5358
) -> QueueTaskResult:
5459
try:
5560
session_generator = get_session()
@@ -69,6 +74,7 @@ async def async_ecoindex_task(
6974
else None,
7075
screenshot_gid=Settings().SCREENSHOTS_GID,
7176
screenshot_uid=Settings().SCREENSHOTS_UID,
77+
custom_headers=custom_headers,
7278
).get_page_analysis()
7379

7480
db_result = await save_ecoindex_result_db(

components/ecoindex/scraper/scrap.py

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
from time import sleep
55
from uuid import uuid4
66

7+
from ua_generator.user_agent import UserAgent
8+
from ua_generator import generate as ua_generate
9+
710
from ecoindex.compute import compute_ecoindex
811
from ecoindex.exceptions.scraper import EcoindexScraperStatusException
912
from ecoindex.models.compute import PageMetrics, Result, ScreenShot, WindowSize
@@ -28,6 +31,7 @@ def __init__(
2831
headless: bool = True,
2932
basic_auth: str | None = None,
3033
cookies: list[SetCookieParam] = [],
34+
custom_headers: dict[str, str] = {},
3135
):
3236
self.url = url
3337
self.window_size = window_size
@@ -45,6 +49,15 @@ def __init__(
4549
self.headless = headless
4650
self.basic_auth = basic_auth
4751
self.cookies = cookies
52+
self.custom_headers = custom_headers
53+
54+
@staticmethod
55+
def get_user_agent() -> UserAgent:
56+
return ua_generate(
57+
device="desktop",
58+
browser="chrome",
59+
platform="linux",
60+
)
4861

4962
@deprecated("This method is useless with new version of EcoindexScraper")
5063
def init_chromedriver(self):
@@ -86,6 +99,7 @@ async def scrap_page(self) -> PageMetrics:
8699
}
87100
if self.basic_auth
88101
else None,
102+
extra_http_headers=self.custom_headers,
89103
)
90104
await self.context.add_cookies(self.cookies)
91105
self.page = await self.context.new_page()

poetry.lock

Lines changed: 23 additions & 26 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

projects/ecoindex_api/poetry.lock

Lines changed: 1 addition & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

projects/ecoindex_api/pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -40,10 +40,10 @@ sqlmodel = "^0.0.14"
4040
sentry-sdk = "^2.8.0"
4141
setuptools = "^75.6.0"
4242
cryptography = "^44.0.2"
43+
ua-generator = "^2.0.5"
4344

4445
[tool.poetry.group.backend.dependencies]
4546
uvicorn = "^0.23.2"
46-
ua-generator = "^2.0.3"
4747

4848
[tool.poetry.group.worker.dependencies]
4949
pillow = "^10.3.0"

projects/ecoindex_cli/poetry.lock

Lines changed: 12 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

projects/ecoindex_cli/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@ pyyaml = "^6.0.1"
3535
rich = "^13.6.0"
3636
scrapy = "^2.11.0"
3737
typer = "^0.9.0"
38+
ua-generator = "^2.0.5"
3839

3940
[tool.poetry.scripts]
4041
ecoindex-cli = "ecoindex.cli.app:app"

projects/ecoindex_scraper/poetry.lock

Lines changed: 12 additions & 1 deletion
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

projects/ecoindex_scraper/pyproject.toml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,7 @@ typing-extensions = "^4.8.0"
2626
pyyaml = "^6.0.1"
2727
pillow = "^10.1.0"
2828
setuptools = ">=69.5.1,<71.0.0"
29+
ua-generator = "^2.0.5"
2930

3031
[build-system]
3132
requires = ["poetry-core>=1.0.0"]

0 commit comments

Comments (0)