main.py
import http.server
import json
import os
import time
import webbrowser

import bs4
import requests
from duckduckgo_search import DDGS
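
# Pipeline overview (summarizing the steps below):
#   step1: DuckDuckGo text search for the query (top 5 results)
#   step2: fetch each result page and extract its visible text
#   preparePrompt / getAiResponse: ask a local Ollama model for a JSON report
#   step3: DuckDuckGo image search for an accompanying image query
#   main:  assemble data.json and serve report.html on localhost:8000
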
def getWebsiteData(url):
    """Fetch a page and return its visible text, preferring the <main> tag."""
    print('Getting website data for:', url)
    response = requests.get(url, timeout=15)
    soup = bs4.BeautifulSoup(response.text, 'html.parser')
    # Use the text inside <main> if the page has one; otherwise fall back
    # to the text of the whole page.
    main_tag = soup.find('main')
    text = main_tag.get_text() if main_tag else soup.get_text()
    # Collapse newlines so the page text sits on one line of the prompt.
    text = text.replace('\n', ' ')
    return text

def preparePrompt(query, websiteData):
    """Build the Ollama chat payload: persona, web page context, report instructions."""
    print('Preparing prompt')
    print('Query:', query)
    prompt = {
        "model": "llama3.2:3b",
        "stream": False,
        "format": "json",
        "messages": [
            {
                "role": "system",
                "content": "You are an advanced, reliable, candid AI system that takes user search queries, converts them into questions, and answers them, using specific facts and details sourced from webpages to prove your answer. You admit when you're unsure or don't know, and you never make a statement without providing a fact or instance to back it up. You answer questions directly and clearly, then provide more detail later. You follow the JSON schema exactly."
            },
            {
                "role": "system",
                "content": ""  # filled in below with the web page context
            },
            {
                "role": "user",
                "content": ""  # filled in below with the report instructions
            }
        ]
    }
    DATE_TIME = time.strftime('%Y-%m-%d %H:%M:%S')
    QUERY = query
    # Inline each scraped page between BEGIN/END markers. A loop (rather than
    # one hard-coded variable per page) also copes with fewer than 5 results.
    pages = ''.join(
        f"BEGIN WEB PAGE {site['href']} {site['rawBody']}END WEB PAGE\n"
        for site in websiteData
    )
    content1 = f"# CONTEXT\nCurrent date: {DATE_TIME}.\n\nHere are results from a web search for '{QUERY}':\n{pages}"
    # Collapse runs of blank lines left over from page extraction.
    while '\n\n\n' in content1:
        content1 = content1.replace('\n\n\n', '\n')
content2 = f"## YOUR JOB\nThe user searched for: '{QUERY}'.\nAbove, I pasted text from some search results for this query.\nYou will:\n1. Take the user's query and infer what questions they want answered.\n2. Read the documents above and find relevant info that answers their questions. Ignore irrelevant results.\n3. Write a document, in precise JSON format as described below, that answers their inferred questions.\n4. Then, add headings and sections that provide more detail below.\n\nWrite your answer by summarizing the webpage data above and/or using your own knowledge.\nYour answer should be fact-filled and SPECIFIC, providing information like prices, review sentiment, dates, addresses, times, recipe instructions and ingredients with specific steps, times and amounts, timelines, characters, answers, features, comparisons, shipping times, related media.\nAvoid repeating text or concepts in your headings or bullets.\nStylistically write as though a Professor or The Economist would, in short, approachable, and professional language.\nDo not acknowledge specific webpage metadata; only quote page content and use its info.\nWhen asked to list entities, list 8-10 per section if possible.\n\n## INFERRING USER QUESTION\nHere are some examples of queries and how they were 'expanded' to infer what the user wants:\n\"notion company\" -> \"Provide facts and figures for Notion's size, products, funding, growth founders, and recent news. Add a section with a company timeline, and a section for key people.\"\n···basque dishes\" -> \"List and describe 8+ basque dishes in your initial answer. Then, add sections explaining the 4 key elements that define Basque cooking overall.···\n\"cook couscous\" -> \"First, provide an ordered list of 3-5 steps to cook couscous, then list all ingredients, tips for cooking and serving various kinds of couscous.\"\n\"die hard\" -> \"Tell me about the movie Die Hard, including plot, cast, reviews, and where to watch.\"\n\"perplexity series b\" -> \"With bullet points, tell me key details of Perplexity's Series B funding round like amount, key funders and goals. In the next sections, give me background on the company, competitors and industry.\"\n\"diff kindle scribe pens\" -> \"Compare and contrast the different Kindle Scribe pens, focusing on price, feature and other important differences.\"\n\"why is Goya famous\" -> \"Explain with specific examples, as an art historian would, why Goya is such an influential and famous painter, including the exact styles that made their work stand out relative to most artists. Include a section listing 5+ examples of their most famous works.\"\n\"white teeth book\" -> \"Provide a synopsis of the book White Teeth, details of the key themes covered, and the most celebrated or criticized aspects of the work.\"\n\"chinese restaurants park slope\" -> \"In your answer, list Chinese restaurants in Park Slope, with a brief description of why each is notable. Then, write headings + sections detailing the top 4's signature dishes, cuisine, prices and reviews.\"\n\"which is bigger, fanduel or draftkings\" -> \"Directly whether Draftkings or Fanduel is larger in the title. Then, provide concrete market share and users, followed by sections with a detailed history of both companies.\"\n\n## JSON Schema\n```\ninterface Response " + "{\n // metadata:\n inferredQuestion: string // In 1-2 sentences, what info do you think the user wants? Look at the examples above. 
Your answer and sections should answer this.\n imageSearchQuery: string // SPECIFIC 8 word image query to accompany the report\n tintColor: string // hex code of thematic color based on content\n title: string // name of topic, e.g. \"How to Cook Cauliflower\", \"Why is Goya famous?\" or \"China has more people than the US.\" If the query was a specifically-answerable question, answer it.\n\n // report content:\n answer: Section // 1st part of doc: directly answer the inferred question, list examples, or say you don't know. Write at least a few sentences. The title of the document should be short but the answer underneath should be in-depth.\n headings: string[4] // 2nd part: headings for 4 additional sections of your report, each 1-2 words. You can provide more detail on an item you talked about in `answer`, provide background knowledge, add a related list or timeline (plot points for a book, career for a person). Write a few sentences at least for each and go into more detail than you did in your answer.\n sections: [string: Section] // Fill out a section for each heading with 5-10 bullet points for each, always include a sentence or two with each bullet point.\n}\n\ninterface Section {\n emojiBullets: String[]; // Detailed bullet points listing entities, steps, facts, or examples. As many as necessary, ideally 5+. Use format \"[emoji] [title]: [detail]\". E.g. \"···· Price: $499 for base model\" or \"1······ Step 1: bring 2 cups water to boil\".\n source: 'webpage' | 'knowledge' // where did the info you just wrote come from?\n citedSubstring: string // If source=webpage, copy 1-2 sentences verbatim from the webpages to support what you wrote. Copy exactly: no translation, paraphraing, removal of syntax or special chars.\n}\n```\n\ntThe emojis should be in this format: :emojiname: e.g.: `:moon:` or `:calendar:`\n\nBelow, your report, following the JSON schema exactly:"
prompt['messages'][1]['content'] = content1
prompt['messages'][2]['content'] = content2
return prompt
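# The payload preparePrompt returns thus carries three messages:
#   messages[0] (system): the fixed persona
#   messages[1] (system): "# CONTEXT ..." with the scraped pages
#   messages[2] (user):   "## YOUR JOB ..." with the schema and instructions
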
def getAiResponse(prompt):
    """POST the prompt to a local Ollama server and parse the model's JSON reply."""
    print('Getting AI response')
    # Equivalent curl request:
    # curl http://localhost:11434/api/chat -d '{
    #   "model": "llama3.2",
    #   "messages": [
    #     {"role": "user", "content": "why is the sky blue?"}
    #   ]
    # }'
    url = 'http://localhost:11434/api/chat'
    headers = {
        'Content-Type': 'application/json'
    }
    response = requests.post(url, headers=headers, json=prompt)
    response = response.json()
    # Save the raw API response for debugging.
    with open('response.json', 'w') as f:
        json.dump(response, f)
    # The reply content should itself be JSON (format="json" in the prompt).
    # Return an empty dict on a parse failure so the caller's retry loop runs.
    try:
        jsonData = json.loads(response['message']['content'])
    except json.JSONDecodeError:
        return {}
    with open('1.json', 'w') as f:
        json.dump(jsonData, f)
    return jsonData

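# Note: getAiResponse assumes an Ollama server is already running locally
# (by default `ollama serve` listens on port 11434) and that the model has
# been pulled first, e.g. with `ollama pull llama3.2:3b`.
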
# STEPS
def step1(query):
    """Search DuckDuckGo and return the top 5 text results."""
    try:
        results = DDGS().text(query, max_results=5)
    except Exception as e:
        print('Error:', e)
        if '202 Ratelimit' in str(e):
            print('You have reached the maximum number of requests per minute. Please try again later.')
        # Without search results there is nothing to report on.
        exit()
    # Round-trip the results through results.json so the file on disk and the
    # in-memory data are guaranteed to match.
    with open('results.json', 'w') as f:
        json.dump(results, f)
    with open('results.json', 'r') as f:
        results = json.load(f)
    return results

def step2(results):
    """Fetch each search result page and attach its text as 'rawBody'."""
    print('Getting data from websites')
    websiteData = []
    for result in results:
        try:
            rawBody = getWebsiteData(result['href'])
        except requests.RequestException as e:
            # Skip pages that fail to load instead of aborting the whole run.
            print('Skipping', result['href'], '-', e)
            continue
        websiteData.append({
            'href': result['href'],
            'title': result['title'],
            'rawBody': rawBody
        })
    return websiteData

def step3(jsonData):
    """Search DuckDuckGo Images for the model's suggested image query."""
    imageQuery = jsonData['imageSearchQuery']
    print('Getting images for:', imageQuery)
    try:
        images = DDGS().images(imageQuery, max_results=5)
    except Exception as e:
        print('Error:', e)
        if '202 Ratelimit' in str(e):
            print('You have reached the maximum number of requests per minute. Please try again later.')
        exit()
    # Save the images to a file.
    with open('images.json', 'w') as f:
        json.dump(images, f)
    return images

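# Note: DDGS().images returns a list of dicts; at the time of writing each one
# carries keys such as 'title', 'image' (the full-size URL), and 'thumbnail',
# which is what report.html is assumed to read from imageUrls.
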
def main(query='Who created GitHub?'):
    # Remove stale .json output from previous runs.
    try:
        for file in os.listdir():
            if file.endswith('.json'):
                os.remove(file)
    except OSError:
        pass
    print('Getting search results for:', query)
    results = step1(query)
    # Get the page text for each result.
    websiteData = step2(results)
    prompt = preparePrompt(query, websiteData)
    # Save the prompt to a file for inspection.
    with open('prompt.json', 'w') as f:
        json.dump(prompt, f)
    print('Getting data from AI')
    jsonData = getAiResponse(prompt)
    # Check that the response is valid; retry up to 3 more times if not.
    required = ('title', 'answer', 'sections')
    if not all(key in jsonData for key in required):
        for _ in range(3):
            print('Invalid response. Trying again...')
            jsonData = getAiResponse(prompt)
            if all(key in jsonData for key in required):
                break
        else:
            # Every retry failed; bail out rather than crash below.
            print('Invalid response. Try again later.')
            return
    # Add the sources to the report data, e.g.:
    # sources: [{"url": "https://www.example.com", "title": "Example"}, ...]
    sources = []
    for data in websiteData:
        sources.append({
            'url': data['href'],
            'title': data['title']
        })
    jsonData['sources'] = sources
    with open('2.json', 'w') as f:
        json.dump(jsonData, f)
    # Search for images and write the final report data.
    images = step3(jsonData)
    with open('data.json', 'w') as f:
        jsonData['imageUrls'] = images
        json.dump(jsonData, f)
    print('Done')
    # Host the files with a simple HTTP server and open the report in the
    # default browser (webbrowser is cross-platform, unlike Windows' `start`).
    PORT = 8000
    Handler = http.server.SimpleHTTPRequestHandler
    webbrowser.open(f'http://localhost:{PORT}/report.html')
    with http.server.HTTPServer(("", PORT), Handler) as httpd:
        print("serving at port", PORT)
        httpd.serve_forever()

if __name__ == "__main__":
    query = input('Enter your query: ')
    main(query)
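
# Example session (a sketch; it assumes Ollama is running and that a
# report.html reading data.json sits alongside this script, since the page
# itself is not generated here):
#   $ python main.py
#   Enter your query: who created github
#   ...
#   serving at port 8000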