knowledge_integration.py
import requests
import time
import os
import logging
from bs4 import BeautifulSoup
from openai import OpenAI
from dotenv import load_dotenv

# Set up logging
logging.basicConfig(level=logging.INFO)

# Ensure the w3 directory exists
os.makedirs("w3", exist_ok=True)

# Load environment variables from .env file
load_dotenv()
client = OpenAI(api_key=os.getenv("OPENAI_API_KEY"))


def sanitize_filename(url):
    """
    Sanitizes the URL to create a safe filename.

    Args:
        url (str): The URL to sanitize.

    Returns:
        str: A sanitized filename.
    """
    return url.replace("https://", "").replace("/", "_").replace(":", "_")


def scrape_and_summarize(url):
    """
    Scrapes the specified URL, summarizes its content, and collects internal links.

    Args:
        url (str): The URL to scrape.

    Returns:
        dict: Dictionary containing the summary and the internal links found on the page.
    """
    try:
        response = requests.get(url)
        response.raise_for_status()  # Ensure we notice bad responses
        logging.info(f"Successfully fetched content from {url}")

        soup = BeautifulSoup(response.text, 'html.parser')
        text = soup.get_text()
        internal_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].startswith('/')]

        # Log extracted text and links for debugging
        logging.info(f"Extracted text from {url}: {text[:500]}...")  # Log first 500 chars
        logging.info(f"Found {len(internal_links)} internal links on {url}")

        # Use GPT-4o to summarize the content
        completion = client.chat.completions.create(
            model="gpt-4o",
            messages=[
                {"role": "user", "content": f"Summarize the following content:\n\n{text[:5000]}"}
            ],
            max_tokens=1500
        )

        # The chat completions API returns message objects, not dicts
        summary = completion.choices[0].message.content
        logging.info(f"Generated summary for {url}")

        return {
            "summary": summary,
            "internal_links": internal_links
        }
    except Exception as e:
        logging.error(f"Error processing {url}: {e}")
        return {
            "summary": "",
            "internal_links": []
        }


def update_knowledge_base(urls):
    """
    Scrapes the specified URLs, summarizes their content, and collects internal links.

    Args:
        urls (list): List of URLs to scrape.

    Returns:
        dict: Dictionary containing each URL as the key and its summary as the value.
    """
    knowledge_base = {}
    for url in urls:
        data = scrape_and_summarize(url)
        summary = data["summary"]
        internal_links = data["internal_links"]

        # Save the summary to a file in the w3 folder
        file_name = os.path.join("w3", sanitize_filename(url) + "_summary.txt")
        with open(file_name, "w", encoding="utf-8") as f:
            f.write(summary)

        # Save the internal links to a file in the w3 folder
        links_file_name = os.path.join("w3", sanitize_filename(url) + "_links.txt")
        with open(links_file_name, "w", encoding="utf-8") as f:
            f.write("\n".join(internal_links))

        knowledge_base[url] = summary
        logging.info(f"Updated knowledge base for {url}")
    return knowledge_base


def real_time_update(interval, urls):
    """
    Continuously updates the knowledge base at the specified interval.

    Args:
        interval (int): Time interval (in seconds) for updating the knowledge base.
        urls (list): List of URLs to scrape.
    """
    while True:
        update_knowledge_base(urls)
        logging.info("Knowledge base updated.")
        time.sleep(interval)


# Example usage
urls = [
    'https://github.com/mastermindml/mastermind',
    'https://github.com/pythaiml/automindx',
    'https://github.com/Professor-Codephreak',
    'https://github.com/augml/lwe-plugin-shell',
    'https://github.com/augml/nicegui'
]
interval = 3600  # Update every hour
real_time_update(interval, urls)
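
# Note: real_time_update blocks forever, so nothing placed below this call would run.
# For a one-off refresh (a sketch, assuming the same urls list above and a valid
# OPENAI_API_KEY in .env), call update_knowledge_base directly instead:
#
#     knowledge_base = update_knowledge_base(urls)
#     for url, summary in knowledge_base.items():
#         print(url, summary[:200])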