Commit c1e6ac8
Merge branch 'ayush' of https://github.com/Querent-ai/querent-ai into ayush
the-non-expert committed Aug 29, 2023
2 parents 3f0c2da + ede0117 commit c1e6ac8
Showing 5 changed files with 8 additions and 5 deletions.
4 changes: 2 additions & 2 deletions querent/collectors/webscaper/web_scraper_collector.py
@@ -22,8 +22,8 @@ async def poll(self):
 
     async def scrape_website(self, website_url: str):
         content = WebpageExtractor().extract_with_bs4(website_url)
-        max_length = len(" ".join(content.split(" ")[:600]))
-        return CollectedBytes(data=content[:max_length], file=None, error=None)
+        max_length = len(' '.join(content.split(" ")[:600]))
+        return CollectorResult({"content": content[:max_length]})
 
 
 class WebScraperFactory(CollectorFactory):
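The max_length computation above caps the scraped text at roughly 600 words: it takes the first 600 space-separated tokens, re-joins them, and uses that string's character length to slice the raw content. A standalone sketch of that behavior:

# Illustration of the truncation in scrape_website: slice the raw
# string to the character length of its first 600 space-separated tokens.
content = "word " * 1000
max_length = len(" ".join(content.split(" ")[:600]))
truncated = content[:max_length]
assert len(truncated.split(" ")) <= 600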
2 changes: 1 addition & 1 deletion querent/common/uri.py
@@ -13,7 +13,7 @@ class Protocol(enum.Enum):
     PostgreSQL = "postgresql"
     Ram = "ram"
     S3 = "s3"
-    Webscraper = "webscraper"
+    Webscraper = "https"
 
     def is_azure(self) -> bool:
         return self == Protocol.Azure
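Rebinding the Webscraper member's value to "https" changes how Protocol resolves by value: a URI with an https scheme now maps to the web-scraper collector. A minimal sketch of that lookup (the assert is illustrative, not from the repo, and assumes Uri parsing looks the scheme up by value):

import enum

class Protocol(enum.Enum):
    S3 = "s3"
    Webscraper = "https"  # after this change, an "https" scheme resolves here

# Enum lookup by value, as the URI resolver presumably does with the scheme:
assert Protocol("https") is Protocol.Webscraper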
1 change: 0 additions & 1 deletion querent/ingestors/pdf_ingestor.py
@@ -1,4 +1,3 @@
-import PyPDF2
 import pypdf
 
 
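The ingestor drops the legacy PyPDF2 import and keeps pypdf, its maintained successor. The ingestor's body is outside this hunk, but a minimal pypdf text-extraction sketch looks like this (the function name and byte-stream input are assumptions, not the repo's code):

import io
import pypdf

def extract_pdf_text(pdf_bytes: bytes) -> str:
    # Read the PDF from memory and concatenate per-page text.
    reader = pypdf.PdfReader(io.BytesIO(pdf_bytes))
    return "\n".join(page.extract_text() or "" for page in reader.pages)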
2 changes: 1 addition & 1 deletion querent/tools/web_page_extractor.py
@@ -77,7 +77,7 @@ def extract_with_3k(self, url):
         article.set_html(html_content)
         article.parse()
         content = article.text.replace(
-            "\t", " ").replace("\n", " ").strip()
+            '\t', ' ').replace('\n', ' ').strip()
 
         return content[:1500]
 
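The set_html/parse/text sequence matches newspaper3k's Article API, so the surrounding method plausibly looks like the sketch below (the Article construction and the html_content source are assumptions; only the quoted lines appear in the diff):

from newspaper import Article

def extract_with_3k(html_content: str) -> str:
    # Assumed setup; the repo's method fetches html_content itself.
    article = Article(url="")
    article.set_html(html_content)
    article.parse()
    # Flatten tabs/newlines to spaces and cap the result at 1500 chars.
    content = article.text.replace("\t", " ").replace("\n", " ").strip()
    return content[:1500]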
4 changes: 4 additions & 0 deletions tests/test_webscrapper.py
@@ -29,8 +29,12 @@ def test_scrapping_data():
     collector = resolver.resolve(uri, webscrapperConfig)
     assert collector is not None
 
+    print("REached here")
+
     async def poll_and_print():
+        print("Part 2")
         async for result in collector.poll():
+            print("Hola...")
             assert not result.is_error()
             print(result.unwrap())
 
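poll_and_print is an async helper, so the test has to drive it on an event loop; the call sits outside this hunk, but asyncio.run is the usual pattern (an assumption about the elided code, shown for completeness):

import asyncio

# Drive the async polling helper to completion inside the sync test.
asyncio.run(poll_and_print())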
