Commit
add rate limiting to webscraper
saraswatpuneet committed Sep 2, 2023
1 parent 8cd2553 commit afb83e3
Showing 2 changed files with 18 additions and 6 deletions.
querent/collectors/webscaper/web_scraper_collector.py: 24 changes (18 additions & 6 deletions)
@@ -1,14 +1,20 @@
 import asyncio
+from aiohttp import ClientSession, TCPConnector
 from querent.collectors.collector_base import Collector
 from querent.collectors.collector_factory import CollectorFactory
 from querent.common.types.collected_bytes import CollectedBytes
 from querent.config.collector_config import CollectorBackend, WebScraperConfig
 from querent.tools.web_page_extractor import WebpageExtractor
 from querent.common.uri import Uri
+from querent.tools.web_page_extractor import WebpageExtractor
 
 
 class WebScraperCollector(Collector):
     def __init__(self, config: WebScraperConfig):
         self.website_url = config.website_url
+        self.semaphore = asyncio.Semaphore(
+            5
+        ) # Adjust the limit as needed (e.g., 5 requests at a time)
+        self.poll_lock = asyncio.Lock() # Lock for the poll method
 
     async def connect(self):
         pass # Any setup logic before scraping
@@ -17,13 +23,19 @@ async def disconnect(self):
         pass # Any cleanup logic after scraping
 
     async def poll(self):
-        content = await self.scrape_website(self.website_url)
-        yield CollectedBytes(file=None, data=content.data, error=None)
+        async with self.poll_lock:
+            content = await self.scrape_website(self.website_url)
+            yield CollectedBytes(file=None, data=content.data, error=None)
 
     async def scrape_website(self, website_url: str):
-        content = WebpageExtractor().extract_with_bs4(website_url)
-        max_length = len(" ".join(content.split(" ")[:600]))
-        return CollectedBytes(data=content[:max_length], file=None, error=None)
+        async with self.semaphore:
+            async with ClientSession(connector=TCPConnector(ssl=False)) as session:
+                async with session.get(website_url) as response:
+                    content = await response.text()
+                    max_length = len(" ".join(content.split(" ")[:600]))
+                    return CollectedBytes(
+                        data=content[:max_length], file=None, error=None
+                    )
 
 
 class WebScraperFactory(CollectorFactory):
File renamed without changes.
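
The rate limiting introduced by this commit is concurrency-based: asyncio.Semaphore(5) caps how many scrape_website calls can hold an HTTP request open at once, and asyncio.Lock serializes poll() so only one consumer drives it at a time. Below is a minimal, self-contained sketch of the same pattern; it is not part of the repository, and RateLimitedScraper, its urls argument, and max_concurrency are hypothetical names used only for illustration.

import asyncio

from aiohttp import ClientSession, TCPConnector


class RateLimitedScraper:
    """Hypothetical stand-in for WebScraperCollector, reduced to the concurrency controls added in this commit."""

    def __init__(self, urls, max_concurrency=5):
        self.urls = urls
        # Semaphore caps the number of HTTP requests in flight at any moment.
        self.semaphore = asyncio.Semaphore(max_concurrency)
        # Lock ensures only one poll() consumer runs at a time.
        self.poll_lock = asyncio.Lock()

    async def scrape(self, url):
        async with self.semaphore:
            async with ClientSession(connector=TCPConnector(ssl=False)) as session:
                async with session.get(url) as response:
                    text = await response.text()
                    # Keep roughly the first 600 whitespace-separated tokens,
                    # mirroring the truncation in scrape_website above.
                    max_length = len(" ".join(text.split(" ")[:600]))
                    return text[:max_length]

    async def poll(self):
        async with self.poll_lock:
            for url in self.urls:
                yield await self.scrape(url)


async def main():
    scraper = RateLimitedScraper(["https://example.com"] * 3, max_concurrency=2)
    async for page in scraper.poll():
        print(f"scraped {len(page)} characters")


if __name__ == "__main__":
    asyncio.run(main())

In the committed code the poll lock already serializes requests made through poll(), so the semaphore only comes into play when scrape_website is invoked concurrently from multiple tasks; it bounds in-flight requests rather than enforcing a requests-per-second rate. Opening a fresh ClientSession per request mirrors the commit, though reusing a single session across requests is the more common aiohttp pattern.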
