-
Notifications
You must be signed in to change notification settings - Fork 1
/
Tool.py
40 lines (29 loc) · 1.38 KB
/
Tool.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
from bs4 import BeautifulSoup
from langchain_community.document_loaders.recursive_url_loader import RecursiveUrlLoader
# Root URL handed to RecursiveUrlLoader below as the starting point of the crawl.
# Replace it with the address of your own college website.
url = "https://www.mietjmu.in/"
# Define a custom extractor function to extract text from HTML using BeautifulSoup
def custom_extractor(html_content):
    """Return the text of an HTML document with the markup removed.

    The markup in ``html_content`` is parsed with BeautifulSoup's
    ``html.parser`` backend, and the value of ``get_text()`` on the parsed
    tree is returned unchanged.
    """
    return BeautifulSoup(html_content, "html.parser").get_text()
# Crawl the site starting at `url`; every fetched page is converted to plain
# text by custom_extractor.
loader = RecursiveUrlLoader(url=url, extractor=custom_extractor)

# Fetch all pages up front as a list of documents (page_content + metadata).
docs = loader.load()

# Destination file for the scraped text.
output_file = "scraped__data.txt"

# UTF-8 is set explicitly so non-ASCII page text is written the same way on
# every platform instead of depending on the locale's default encoding.
with open(output_file, "w", encoding="utf-8") as file:
    for doc in docs:
        source = doc.metadata.get("source")
        content = doc.page_content

        # Without a string URL and string body there is nothing meaningful to
        # record, so such documents are reported and skipped.
        if not (isinstance(source, str) and isinstance(content, str)):
            print("Skipped a document due to non-string content.")
            continue

        # metadata.get("title") is None when the key is absent (presumably
        # pages without a <title> tag -- verify against the loader's metadata
        # extractor). A missing title alone must not discard the page's
        # content, so fall back to a placeholder instead of skipping.
        title = doc.metadata.get("title")
        if not isinstance(title, str):
            title = "Untitled"

        # One write per document: header lines, body, then a blank separator.
        file.write(
            f"Page Title: {title}\n"
            f"Page URL: {source}\n"
            f"Page Content:\n{content}\n\n"
        )

# Reported only after the `with` block has closed (and flushed) the file.
print("Data has been successfully written to", output_file)