
Commit

jaddo
saraswatpuneet committed Sep 2, 2023
1 parent 00d1b86 commit 2450b79
Showing 2 changed files with 57 additions and 2 deletions.
18 changes: 16 additions & 2 deletions querent/collectors/webscaper/web_scraper_collector.py
@@ -6,6 +6,7 @@
from querent.config.collector_config import CollectorBackend, WebScraperConfig
from querent.common.uri import Uri
from querent.tools.web_page_extractor import WebpageExtractor
from urllib.parse import urlparse, urljoin


class WebScraperCollector(Collector):
@@ -24,8 +25,14 @@ async def disconnect(self):

async def poll(self):
async with self.poll_lock:
-            content = await self.scrape_website(self.website_url)
-            yield CollectedBytes(file=None, data=content.data, error=None)
+            urls_to_scrape = [self.website_url]
+            while urls_to_scrape:
+                url = urls_to_scrape.pop()
+                content = await self.scrape_website(url)
+                yield CollectedBytes(file=None, data=content.data, error=None)
+                # Find and add links from this page to the list of URLs to scrape
+                new_urls = self.extract_links(content.data, url)
+                urls_to_scrape.extend(new_urls)

async def scrape_website(self, website_url: str):
async with self.semaphore:
@@ -37,6 +44,13 @@ async def scrape_website(self, website_url: str):
data=content[:max_length], file=None, error=None
)

def extract_links(self, content: str, base_url: str):
# Use a proper HTML parser to extract links
extractor = WebpageExtractor()
links = extractor.extract_links(base_url)
# Join relative links with the base URL
return [urljoin(base_url, link) for link in links]


class WebScraperFactory(CollectorFactory):
def __init__(self):
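The change to poll() above replaces the single scrape of self.website_url with a simple crawl: a urls_to_scrape stack is seeded with the configured URL, each popped page is scraped and yielded as CollectedBytes, and links found on that page are pushed back onto the stack via the new extract_links() helper, which resolves them against the base URL with urljoin. The sketch below is not part of this commit; it is a minimal illustration of the same crawl pattern under two stated assumptions: fetch and extract_links are hypothetical stand-ins for the collector's scrape_website() and extract_links() methods, and a visited set (absent from the diff) is added so a page linked from several places is scraped only once.

# Minimal sketch of the crawl loop that poll() now implements.
# Assumptions: `fetch(url)` and `extract_links(html, base_url)` are stand-ins
# for scrape_website() and extract_links(); the `visited` set is an addition
# of this sketch, not of the commit.
from urllib.parse import urljoin, urlparse

def crawl(start_url, fetch, extract_links):
    visited = set()
    urls_to_scrape = [start_url]
    while urls_to_scrape:
        url = urls_to_scrape.pop()
        if url in visited:
            continue
        visited.add(url)
        html = fetch(url)
        yield url, html
        for link in extract_links(html, url):
            absolute = urljoin(url, link)
            # Keep the crawl on the starting site; the committed code applies
            # its "internal links" filter inside WebpageExtractor.extract_links().
            if urlparse(absolute).netloc == urlparse(start_url).netloc:
                urls_to_scrape.append(absolute)

Because poll() is declared async and yields, it is an async generator and is drained with "async for". A hypothetical consumer, assuming `collector` is an already connected WebScraperCollector:

async def drain(collector):
    async for collected in collector.poll():
        # CollectedBytes exposes the scraped payload as .data and any error as .error.
        if collected.error is None:
            print(len(collected.data))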
41 changes: 41 additions & 0 deletions querent/tools/web_page_extractor.py
@@ -243,3 +243,44 @@ def extract_with_lxml(self, url):
f"Unknown error while extracting text from HTML (lxml): {str(e)}"
)
return ""

def extract_links(self, url):
"""
Extract internal links from a webpage.
Args:
url (str): The URL of the webpage to extract links from.
Returns:
list: A list of internal links (URLs).
"""
try:
headers = {"User-Agent": random.choice(USER_AGENTS)}
response = requests.get(url, headers=headers, timeout=10)
if response.status_code == 200:
soup = BeautifulSoup(response.text, "html.parser")
links = []
for link in soup.find_all("a", href=True):
link_href = link.get("href")
if (
link_href.startswith("/")
or link_href.startswith(".")
or link_href.startswith("#")
):
link_href = urljoin(url, link_href)
if (
link_href.startswith(url)
and link_href not in self.crawled_urls
):
links.append(link_href)
return links
else:
logger.error(
f"Error while extracting links from HTML (bs4): {response.status_code} for url - {url}"
)
return []
except Exception as e:
logger.error(
f"Unknown error while extracting links from HTML (bs4): {str(e)}"
)
return []
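
The new WebpageExtractor.extract_links() fetches the page itself with requests under a random User-Agent, parses it with BeautifulSoup, resolves relative, dot-relative, and fragment hrefs against the page URL, and returns only links that start with that URL and are not already recorded in self.crawled_urls; on a non-200 response or an exception it logs the problem and returns an empty list. It relies on requests, BeautifulSoup, random, USER_AGENTS, urljoin, and logger already being available in web_page_extractor.py, since this hunk adds no imports. A hypothetical standalone usage sketch, not part of the commit (the URL is a placeholder; the no-argument construction matches how the collector above builds the extractor):

from querent.tools.web_page_extractor import WebpageExtractor

extractor = WebpageExtractor()
# Placeholder URL for illustration only.
internal_links = extractor.extract_links("https://example.com/docs/")
for link in internal_links:
    print(link)

Note that the collector's extract_links() in the first file passes the already-scraped page content, but the extractor re-fetches base_url itself; the sketch above simply follows the extractor's own signature.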
