-
Notifications
You must be signed in to change notification settings - Fork 4
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
refactor(scraper): Create a bulk analysis helper for scraper and refactor cli (#86)
- Loading branch information
Showing
5 changed files
with
115 additions
and
84 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,51 +1,14 @@ | ||
from asyncio import run | ||
from ecoindex.config import Settings | ||
|
||
from ecoindex.models import Result, WindowSize, CliHost | ||
from ecoindex.scraper import EcoindexScraper | ||
|
||
|
||
def run_page_analysis(
    url: str,
    window_size: WindowSize,
    wait_after_scroll: int = 3,
    wait_before_scroll: int = 3,
    logger=None,
) -> tuple[Result, bool]:
    """
    Run the Ecoindex analysis for a single page.

    Args:
        url: The page URL to analyze.
        window_size: Browser viewport used for the analysis.
        wait_after_scroll: Seconds to wait after scrolling the page.
        wait_before_scroll: Seconds to wait before scrolling the page.
        logger: Optional logger used to report analysis failures.

    Returns:
        A tuple ``(result, success)``: the analysis ``Result`` and ``True``
        on success, or a zeroed fallback ``Result`` and ``False`` when the
        scraper raised.
    """
    scraper = EcoindexScraper(
        url=str(url),
        window_size=window_size,
        wait_after_scroll=wait_after_scroll,
        wait_before_scroll=wait_before_scroll,
        page_load_timeout=20,
    )
    try:
        return (run(scraper.get_page_analysis()), True)
    except Exception as e:
        # Bug fix: logger defaults to None, so guard before calling it —
        # previously a scraper failure raised AttributeError here instead
        # of falling through to the zeroed fallback Result.
        if logger is not None:
            logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")

    return (
        Result(
            url=url,
            water=0,
            width=window_size.width,
            height=window_size.height,
            size=0,
            nodes=0,
            requests=0,
        ),
        False,
    )
from ecoindex.models import CliHost | ||
|
||
|
||
def replace_localhost_with_hostdocker(netloc: str) -> CliHost:
    """Resolve the effective host for a netloc, mapping ``localhost`` to
    ``host.docker.internal`` when running inside a Docker container.

    Returns:
        A CliHost carrying the resolved domain and the (possibly
        rewritten) netloc.
    """
    # Settings() is evaluated unconditionally, matching the original
    # short-circuit order of the condition.
    in_docker = bool(Settings().DOCKER_CONTAINER)
    has_localhost = "localhost" in netloc

    if in_docker and has_localhost:
        domain = "host.docker.internal"
        netloc = netloc.replace("localhost", domain)
    elif has_localhost:
        domain = "localhost"
    else:
        domain = netloc

    return CliHost(domain=domain, netloc=netloc)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,74 @@ | ||
from asyncio import run | ||
from concurrent.futures import ThreadPoolExecutor, as_completed | ||
from typing import AsyncGenerator | ||
|
||
from ecoindex.models.compute import Result, WindowSize | ||
from ecoindex.scraper.scrap import EcoindexScraper | ||
|
||
|
||
def run_page_analysis(
    url: str,
    window_size: WindowSize,
    wait_after_scroll: int = 3,
    wait_before_scroll: int = 3,
    logger=None,
) -> tuple[Result, bool]:
    """
    Run the Ecoindex analysis for a single page.

    Args:
        url: The page URL to analyze.
        window_size: Browser viewport used for the analysis.
        wait_after_scroll: Seconds to wait after scrolling the page.
        wait_before_scroll: Seconds to wait before scrolling the page.
        logger: Optional logger used to report analysis failures.

    Returns:
        A tuple ``(result, success)``: the analysis ``Result`` and ``True``
        on success, or a zeroed fallback ``Result`` and ``False`` when the
        scraper raised.
    """
    scraper = EcoindexScraper(
        url=str(url),
        window_size=window_size,
        wait_after_scroll=wait_after_scroll,
        wait_before_scroll=wait_before_scroll,
        page_load_timeout=20,
    )
    try:
        return (run(scraper.get_page_analysis()), True)
    except Exception as e:
        # Bug fix: logger defaults to None, so guard before calling it —
        # previously a scraper failure raised AttributeError here instead
        # of falling through to the zeroed fallback Result.
        if logger is not None:
            logger.error(f"{url} -- {e.msg if hasattr(e, 'msg') else e}")

    return (
        Result(
            url=url,
            water=0,
            width=window_size.width,
            height=window_size.height,
            size=0,
            nodes=0,
            requests=0,
        ),
        False,
    )
|
||
|
||
async def bulk_analysis(
    max_workers: int,
    urls,
    window_sizes,
    wait_after_scroll: int = 0,
    wait_before_scroll: int = 0,
    logger=None,
) -> AsyncGenerator[tuple[Result, bool], None]:
    """
    Run page analyses for every (url, window_size) pair in a thread pool
    and yield each result as it completes.

    Args:
        max_workers: Maximum number of worker threads.
        urls: Iterable of page URLs to analyze.
        window_sizes: Iterable of viewports; each URL is analyzed once per
            viewport.
        wait_after_scroll: Seconds to wait after scrolling each page.
        wait_before_scroll: Seconds to wait before scrolling each page.
        logger: Optional logger passed through to run_page_analysis.

    Yields:
        ``(Result, success)`` tuples in completion order (not submission
        order).
    """
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        # The original stored a parameters tuple as the dict value for each
        # future, but that value was never read — a plain list of futures is
        # all as_completed needs.
        futures = [
            executor.submit(
                run_page_analysis,
                url,
                window_size,
                wait_after_scroll,
                wait_before_scroll,
                logger,
            )
            for url in urls
            for window_size in window_sizes
        ]

        # NOTE(review): as_completed blocks the current thread between
        # results, so this async generator does not yield control to the
        # event loop while waiting — acceptable for the CLI use case, but
        # worth confirming if reused inside a server event loop.
        for future in as_completed(futures):
            yield future.result()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,17 @@ | ||
import asyncio | ||
from pprint import pprint | ||
from uuid import uuid1 | ||
|
||
from ecoindex.models.compute import ScreenShot | ||
from ecoindex.scraper import EcoindexScraper | ||
|
||
pprint(asyncio.run(EcoindexScraper(url="http://ecoindex.fr").get_page_analysis())) | ||
# Example: run a full Ecoindex analysis against a live site, saving a
# screenshot of the rendered page under ./screenshots (uuid1 gives the
# screenshot a unique id per run).
scraper = EcoindexScraper(
    url="https://www.kiabi.com",
    screenshot=ScreenShot(id=str(uuid1()), folder="./screenshots"),
)

# Each call drives the scraper's async API through its own event loop.
result = asyncio.run(scraper.get_page_analysis())
all_requests = asyncio.run(scraper.get_all_requests())
requests_by_category = asyncio.run(scraper.get_requests_by_category())

pprint(result)
9f71f8d
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Branch coverage •