refact(scraper): Create a bulk analysis helper for scraper and refactor cli (#86)
vvatelot authored Jul 12, 2024
1 parent e53ed87 commit 9f71f8d
Showing 5 changed files with 115 additions and 84 deletions.
65 changes: 23 additions & 42 deletions bases/ecoindex/cli/app.py
@@ -1,4 +1,4 @@
-from concurrent.futures import ThreadPoolExecutor, as_completed
+from asyncio import run
 from datetime import datetime
 from multiprocessing import cpu_count
 from os.path import dirname
@@ -16,9 +16,9 @@
     get_window_sizes_from_args,
 )
 from ecoindex.cli.console_output import display_result_synthesis
-from ecoindex.cli.helper import run_page_analysis
 from ecoindex.cli.report import Report
 from ecoindex.models import ExportFormat, Language
+from ecoindex.scraper.helper import bulk_analysis
 from ecoindex.utils.files import write_results_to_file, write_urls_to_file
 from loguru import logger
 from rich.progress import (
@@ -165,7 +165,9 @@ def analyze(
             urls=urls, urls_file=urls_file, tmp_folder=tmp_folder
         )
     elif sitemap:
-        secho(f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA)
+        secho(
+            f"⏲️ Crawling sitemap url {sitemap} -> Wait a minute!", fg=colors.MAGENTA
+        )
         urls = get_urls_from_sitemap(main_url=sitemap)
     (
         file_prefix,
@@ -220,47 +222,26 @@ def analyze(
         TextColumn("•"),
         TimeRemainingColumn(),
     ) as progress:
+        count_errors = 0
         task = progress.add_task("Processing", total=len(urls) * len(window_sizes))
 
-        with ThreadPoolExecutor(max_workers=max_workers) as executor:
-            future_to_analysis = {}
-
-            for url in urls:
-                for window_size in window_sizes:
-                    future_to_analysis[
-                        executor.submit(
-                            run_page_analysis,
-                            url,
-                            window_size,
-                            wait_after_scroll,
-                            wait_before_scroll,
-                            logger,
-                        )
-                    ] = (
-                        url,
-                        window_size,
-                        wait_after_scroll,
-                        wait_before_scroll,
-                        logger,
-                    )
-            count_errors = 0
-
-            for future in as_completed(future_to_analysis):
-                try:
-                    result, success = future.result()
-
-                    if not success:
-                        count_errors += 1
-
-                    else:
-                        results.append(result)
-
-                except Exception as e:
-                    count_errors += 1
-                    url, _, _, _, _ = future_to_analysis[future]
-                    logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-                progress.update(task, advance=1)
+        analysis_results = run(
+            bulk_analysis(
+                max_workers=max_workers,
+                urls=urls,
+                window_sizes=window_sizes,
+                wait_after_scroll=wait_after_scroll,
+                wait_before_scroll=wait_before_scroll,
+                logger=logger,
+            )
+        )
+
+        for result, success in analysis_results:
+            results.append(result)
+            if not success:
+                count_errors += 1
+
+            progress.update(task, advance=1)
 
     if count_errors > 0:
         secho(
43 changes: 3 additions & 40 deletions bases/ecoindex/cli/helper.py
@@ -1,51 +1,14 @@
-from asyncio import run
 from ecoindex.config import Settings
 
-from ecoindex.models import Result, WindowSize, CliHost
-from ecoindex.scraper import EcoindexScraper
-
-
-def run_page_analysis(
-    url: str,
-    window_size: WindowSize,
-    wait_after_scroll: int = 3,
-    wait_before_scroll: int = 3,
-    logger=None,
-) -> tuple[Result, bool]:
-    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
-    scraper = EcoindexScraper(
-        url=str(url),
-        window_size=window_size,
-        wait_after_scroll=wait_after_scroll,
-        wait_before_scroll=wait_before_scroll,
-        page_load_timeout=20,
-    )
-    try:
-        return (run(scraper.get_page_analysis()), True)
-    except Exception as e:
-        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
-
-    return (
-        Result(
-            url=url,
-            water=0,
-            width=window_size.width,
-            height=window_size.height,
-            size=0,
-            nodes=0,
-            requests=0,
-        ),
-        False,
-    )
+from ecoindex.models import CliHost
 
 
 def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
     if Settings().DOCKER_CONTAINER and "localhost" in netloc:
         domain = "host.docker.internal"
         netloc = netloc.replace("localhost", domain)
-    elif "localhost" in netloc :
+    elif "localhost" in netloc:
         domain = "localhost"
-    else :
+    else:
         domain = netloc
 
     return CliHost(domain=domain, netloc=netloc)
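
For reference, the retained helper maps a localhost netloc to a Docker-reachable host. A quick sketch of the expected behavior (the port is a hypothetical value; outside a container, Settings().DOCKER_CONTAINER is assumed falsy):

from ecoindex.cli.helper import replace_localhost_with_hostdocker

# Outside Docker: localhost is kept as-is.
host = replace_localhost_with_hostdocker("localhost:8000")
print(host.domain, host.netloc)  # localhost localhost:8000

# Inside Docker (Settings().DOCKER_CONTAINER set), "localhost" is rewritten
# so the browser can reach the host machine:
# domain="host.docker.internal", netloc="host.docker.internal:8000"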
74 changes: 74 additions & 0 deletions components/ecoindex/scraper/helper.py
@@ -0,0 +1,74 @@
+from asyncio import run
+from concurrent.futures import ThreadPoolExecutor, as_completed
+from typing import AsyncGenerator
+
+from ecoindex.models.compute import Result, WindowSize
+from ecoindex.scraper.scrap import EcoindexScraper
+
+
+def run_page_analysis(
+    url: str,
+    window_size: WindowSize,
+    wait_after_scroll: int = 3,
+    wait_before_scroll: int = 3,
+    logger=None,
+) -> tuple[Result, bool]:
+    """Run the page analysis and return the result and a boolean indicating if the analysis was successful"""
+    scraper = EcoindexScraper(
+        url=str(url),
+        window_size=window_size,
+        wait_after_scroll=wait_after_scroll,
+        wait_before_scroll=wait_before_scroll,
+        page_load_timeout=20,
+    )
+    try:
+        return (run(scraper.get_page_analysis()), True)
+    except Exception as e:
+        logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")
+
+    return (
+        Result(
+            url=url,
+            water=0,
+            width=window_size.width,
+            height=window_size.height,
+            size=0,
+            nodes=0,
+            requests=0,
+        ),
+        False,
+    )
+
+
+async def bulk_analysis(
+    max_workers,
+    urls,
+    window_sizes,
+    wait_after_scroll: int = 0,
+    wait_before_scroll: int = 0,
+    logger=None,
+) -> AsyncGenerator[tuple[Result, bool], None]:
+    with ThreadPoolExecutor(max_workers=max_workers) as executor:
+        future_to_analysis = {}
+
+        for url in urls:
+            for window_size in window_sizes:
+                future_to_analysis[
+                    executor.submit(
+                        run_page_analysis,
+                        url,
+                        window_size,
+                        wait_after_scroll,
+                        wait_before_scroll,
+                        logger,
+                    )
+                ] = (
+                    url,
+                    window_size,
+                    wait_after_scroll,
+                    wait_before_scroll,
+                    logger,
+                )
+
+        for future in as_completed(future_to_analysis):
+            yield future.result()
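
Because bulk_analysis is an async generator, callers iterate it with async for rather than awaiting a list. A minimal consumption sketch (the URL, window size, and worker count are placeholder values):

from asyncio import run

from ecoindex.models.compute import WindowSize
from ecoindex.scraper.helper import bulk_analysis


async def main() -> None:
    # Results are yielded in completion order, not submission order,
    # since they come from as_completed() over the thread pool futures.
    async for result, success in bulk_analysis(
        max_workers=2,
        urls=["https://www.ecoindex.fr"],
        window_sizes=[WindowSize(width=1920, height=1080)],
    ):
        print(result.url, "ok" if success else "failed")


run(main())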
4 changes: 3 additions & 1 deletion components/ecoindex/scraper/scrap.py
@@ -25,6 +25,7 @@ def __init__(
         screenshot_uid: int | None = None,
         screenshot_gid: int | None = None,
         page_load_timeout: int = 20,
+        headless: bool = True,
     ):
         self.url = url
         self.window_size = window_size
@@ -39,6 +40,7 @@ def __init__(
         self.har_temp_file_path = (
             f"/tmp/ecoindex-{self.now.strftime('%Y-%m-%d-%H-%M-%S-%f')}-{uuid4()}.har"
         )
+        self.headless = headless
 
     @deprecated("This method is useless with new version of EcoindexScraper")
     def init_chromedriver(self):
@@ -64,7 +66,7 @@ async def get_requests_by_category(self) -> MimetypeAggregation:
 
     async def scrap_page(self) -> PageMetrics:
         async with async_playwright() as p:
-            browser = await p.chromium.launch()
+            browser = await p.chromium.launch(headless=self.headless)
             self.page = await browser.new_page(
                 record_har_path=self.har_temp_file_path,
                 screen=self.window_size.model_dump(),
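
The new headless flag defaults to True, so existing callers keep the previous behavior; passing headless=False launches a visible Chromium window, which can help when debugging an analysis locally. A small sketch (the URL is a placeholder):

import asyncio

from ecoindex.scraper import EcoindexScraper

# headless=False opens a visible browser window; the default (True) is unchanged.
scraper = EcoindexScraper(url="https://www.ecoindex.fr", headless=False)
print(asyncio.run(scraper.get_page_analysis()))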
13 changes: 12 additions & 1 deletion development/ecoindex_scraper.py
@@ -1,6 +1,17 @@
 import asyncio
 from pprint import pprint
+from uuid import uuid1
 
+from ecoindex.models.compute import ScreenShot
 from ecoindex.scraper import EcoindexScraper
 
-pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis()))
+scraper = EcoindexScraper(
+    url="https://www.kiabi.com",
+    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
+)
+
+result = asyncio.run(scraper.get_page_analysis())
+all_requests = asyncio.run(scraper.get_all_requests())
+requests_by_category = asyncio.run(scraper.get_requests_by_category())
+
+pprint(result)

1 comment on commit 9f71f8d

@github-actions

Coverage PR

Branch coverage

File                                     Stmts   Miss   Cover   Missing
bases/ecoindex/cli/app.py                   95     43     54%   139–142, 144, 159–160, 172, 205–206, 208, 216, 225–226, 228, 239–242, 244, 246–247, 252, 256–257, 259, 261, 264, 266–268, 270–271, 274–276, 284, 288, 320, 322, 330, 334, 338
bases/ecoindex/cli/helper.py                10      0    100%
components/ecoindex/scraper/helper.py       20     13     35%   17, 24–27, 29, 51–52, 54–56, 73–74
components/ecoindex/scraper/scrap.py        92     51     44%   47, 50–51, 53, 62, 65, 68–70, 75–77, 79–83, 86–89, 91, 93, 100–103, 110–112, 114–123, 132–133, 136–137, 139, 148–149, 154–155, 159–160
TOTAL                                      698    227     67%
