Skip to content

Commit 6ad2e18

Browse files
committed
[SCRAPER]: include title on scraped
1 parent a0e7cfc commit 6ad2e18

File tree

3 files changed: +25 additions, −7 deletions

jaseci_ai_kit/jac_misc/jac_misc/scraper/async_scraper.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,13 @@ async def scrape(
9494
pages: list, pre_configs: list = [], detailed: bool = False, target: str = None
9595
):
9696
content = ""
97-
urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()}
97+
urls = {
98+
"url": pages[0]["goto"]["url"],
99+
"scanned": [],
100+
"scanned_urls": set(),
101+
"scraped": {},
102+
"crawled": set(),
103+
}
98104

99105
ws = Client()
100106
trigger_id = uuid4()
@@ -140,6 +146,7 @@ async def scrape(
140146

141147
if detailed:
142148
return {
149+
"url": urls["url"],
143150
"content": content,
144151
"scanned": urls["scanned"],
145152
"scraped": urls["scraped"],
@@ -202,7 +209,9 @@ async def getters(page: Page, specss: list[dict], urls: dict):
202209
if expression:
203210
print(f"[getters]: getting content from {page.url}")
204211
content += await page.evaluate(f"() =>{expression}")
205-
add_url(page, urls, expression)
212+
add_url(page, urls, await page.title())
213+
else:
214+
add_url(page, urls)
206215

207216
await run_scripts(page, post, urls)
208217

jaseci_ai_kit/jac_misc/jac_misc/scraper/sync_scraper.py

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,13 @@ def scrape(
4545
pages: list, pre_configs: list = [], detailed: bool = False, target: str = None
4646
):
4747
content = ""
48-
urls = {"scanned": [], "scanned_urls": set(), "scraped": [], "crawled": set()}
48+
urls = {
49+
"url": pages[0]["goto"]["url"],
50+
"scanned": [],
51+
"scanned_urls": set(),
52+
"scraped": {},
53+
"crawled": set(),
54+
}
4955

5056
trigger_id = uuid4()
5157

@@ -89,6 +95,7 @@ def scrape(
8995

9096
if detailed:
9197
return {
98+
"url": urls["url"],
9299
"content": content,
93100
"scanned": urls["scanned"],
94101
"scraped": urls["scraped"],
@@ -150,7 +157,9 @@ def getters(page: Page, specss: list[dict], urls: dict):
150157
if expression:
151158
print(f"[getters]: getting content from {page.url}")
152159
content += page.evaluate(f"() =>{expression}")
153-
add_url(page, urls, expression)
160+
add_url(page, urls, page.title())
161+
else:
162+
add_url(page, urls)
154163

155164
run_scripts(page, post, urls)
156165

jaseci_ai_kit/jac_misc/jac_misc/scraper/utils.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,7 @@
22
from copy import deepcopy
33

44

5-
def add_url(page, urls: dict, scraped: bool = False, error: str = None):
5+
def add_url(page, urls: dict, title: str = None, error: str = None):
66
url = page.url
77
source = page.source
88
if url:
@@ -16,8 +16,8 @@ def add_url(page, urls: dict, scraped: bool = False, error: str = None):
1616
scan["source"] = source
1717
urls["scanned"].append(scan)
1818

19-
if scraped and url not in urls["scraped"]:
20-
urls["scraped"].append(url)
19+
if title and url not in urls["scraped"]:
20+
urls["scraped"][url] = title
2121

2222

2323
def add_crawl(pages: list, pre_configs: list, urls: dict, url: str, def_crawl: dict):

Comments (0)