Skip to content

Commit 4f87fc8

Browse files
committed
[SCRAPER]: include title on scraped
1 parent a0e7cfc commit 4f87fc8

File tree

3 files changed

+23
-7
lines changed

3 files changed

+23
-7
lines changed

jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -94,7 +94,13 @@ async def scrape(
9494
pages: list, pre_configs: list = [], detailed: bool = False, target: str = None
9595
):
9696
content = ""
97-
urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()}
97+
urls = {
98+
"url": pages[0]["goto"]["url"],
99+
"scanned": [],
100+
"scanned_urls": set(),
101+
"scraped": {},
102+
"crawled": set(),
103+
}
98104

99105
ws = Client()
100106
trigger_id = uuid4()
@@ -202,7 +208,9 @@ async def getters(page: Page, specss: list[dict], urls: dict):
202208
if expression:
203209
print(f"[getters]: getting content from {page.url}")
204210
content += await page.evaluate(f"() =>{expression}")
205-
add_url(page, urls, expression)
211+
add_url(page, urls, await page.title())
212+
else:
213+
add_url(page, urls)
206214

207215
await run_scripts(page, post, urls)
208216

jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py

Lines changed: 10 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -45,7 +45,13 @@ def scrape(
4545
pages: list, pre_configs: list = [], detailed: bool = False, target: str = None
4646
):
4747
content = ""
48-
urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()}
48+
urls = {
49+
"url": pages[0]["goto"]["url"],
50+
"scanned": [],
51+
"scanned_urls": set(),
52+
"scraped": {},
53+
"crawled": set(),
54+
}
4955

5056
trigger_id = uuid4()
5157

@@ -150,7 +156,9 @@ def getters(page: Page, specss: list[dict], urls: dict):
150156
if expression:
151157
print(f"[getters]: getting content from {page.url}")
152158
content += page.evaluate(f"() =>{expression}")
153-
add_url(page, urls, expression)
159+
add_url(page, urls, page.title())
160+
else:
161+
add_url(page, urls)
154162

155163
run_scripts(page, post, urls)
156164

jaseci_ai_kit/jac_misc/jac_misc/scraper/utils.py

Lines changed: 3 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -2,7 +2,7 @@
22
from copy import deepcopy
33

44

5-
def add_url(page, urls: dict, scraped: bool = False, error: str = None):
5+
def add_url(page, urls: dict, title: str = None, error: str = None):
66
url = page.url
77
source = page.source
88
if url:
@@ -16,8 +16,8 @@ def add_url(page, urls: dict, scraped: bool = False, error: str = None):
1616
scan["source"] = source
1717
urls["scanned"].append(scan)
1818

19-
if scraped and url not in urls["scraped"]:
20-
urls["scraped"].append(url)
19+
if title and url not in urls["scraped"]:
20+
urls["scraped"][url] = title
2121

2222

2323
def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict):

0 commit comments

Comments
 (0)