Skip to content

Commit eb9c049

Browse files
committed
make questions
1 parent ce50b86 commit eb9c049

13 files changed

+1054
-944
lines changed

.gitignore

+2-1
Original file line number | Diff line number | Diff line change
@@ -1,2 +1,3 @@
11
/uit-chatbot
2-
/langchain-rag-tutorial
2+
/langchain-rag-tutorial
3+
.env

app.py

+1-1
Original file line number | Diff line number | Diff line change
@@ -99,7 +99,7 @@ def answer_query(query, context):
9999
Answer a query using a language model with the given context.
100100
"""
101101
try:
102-
api_key = os.getenv("GOOGLE_API_KEY",default="<REDACTED-API-KEY>") # Replace "default-key" with actual default or raise error
102+
api_key = os.getenv("GOOGLE_API_KEY") # Replace "default-key" with actual default or raise error
103103
llm = ChatGoogleGenerativeAI(
104104
model="gemini-1.5-pro",
105105
temperature=0,

make_question.ipynb

+308
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,308 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type": "code",
5+
"execution_count": 65,
6+
"metadata": {},
7+
"outputs": [],
8+
"source": [
9+
"import docx\n",
10+
"\n",
11+
"def make_questions(path):\n",
12+
" #read docx file\n",
13+
" doc = docx.Document(path)\n",
14+
" # Danh sách chứa các câu hỏi\n",
15+
" questions = []\n",
16+
" current_question = \"\"\n",
17+
" # Lặp qua từng đoạn văn trong file DOCX\n",
18+
" for paragraph in doc.paragraphs:\n",
19+
" text = paragraph.text.strip()\n",
20+
" \n",
21+
" # Nếu bắt đầu bằng \"Câu\", đó là câu hỏi mới\n",
22+
" if text.startswith(\"Câu\"):\n",
23+
" if current_question: # Nếu đã có câu hỏi trước đó, thêm vào danh sách\n",
24+
" questions.append([current_question[7:]])\n",
25+
" current_question = text # Bắt đầu câu hỏi mới\n",
26+
" elif current_question: # Nếu không phải câu hỏi, tiếp tục thêm đáp án\n",
27+
" current_question += \" \" + text\n",
28+
"\n",
29+
" # Thêm câu hỏi cuối cùng (nếu có)\n",
30+
" if current_question:\n",
31+
" questions.append([current_question.strip()])\n",
32+
" return questions"
33+
]
34+
},
35+
{
36+
"cell_type": "code",
37+
"execution_count": 58,
38+
"metadata": {},
39+
"outputs": [],
40+
"source": [
41+
"from docx import Document\n",
42+
"# Path to the DOCX file\n",
43+
"def make_answers(path):\n",
44+
" # Open the Word document\n",
45+
" document = Document(path)\n",
46+
"\n",
47+
" # List to store underlined text\n",
48+
" underlined_text = []\n",
49+
" current_underlined = \"\" # To handle multi-line underlined text\n",
50+
"\n",
51+
" # Loop through all paragraphs and their runs\n",
52+
" for paragraph in document.paragraphs:\n",
53+
" for run in paragraph.runs:\n",
54+
" if run.underline: # Check if the run is underlined\n",
55+
" current_underlined += run.text.strip() + \"\" # Append text and keep it continuous\n",
56+
" else:\n",
57+
" if current_underlined: # If switching from underlined to non-underlined text\n",
58+
" underlined_text.append(current_underlined.strip())\n",
59+
" current_underlined = \"\" # Reset the temporary variable\n",
60+
"\n",
61+
" # Add any remaining underlined text\n",
62+
" if current_underlined:\n",
63+
" underlined_text.append(current_underlined.strip())\n",
64+
"\n",
65+
" # Return the list of underlined text\n",
66+
" return underlined_text\n"
67+
]
68+
},
69+
{
70+
"cell_type": "code",
71+
"execution_count": 95,
72+
"metadata": {},
73+
"outputs": [
74+
{
75+
"name": "stderr",
76+
"output_type": "stream",
77+
"text": [
78+
"<>:6: SyntaxWarning: invalid escape sequence '\\c'\n",
79+
"<>:6: SyntaxWarning: invalid escape sequence '\\c'\n",
80+
"C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\437966213.py:6: SyntaxWarning: invalid escape sequence '\\c'\n",
81+
" with open('test-data\\chu-nghia-xa-hoi-khoa-hoc-questions.csv', 'w', newline='') as file:\n"
82+
]
83+
}
84+
],
85+
"source": [
86+
"path = \"D:/533463523-NGAN-HANG-CAU-HỎI-TRẮC-NGHIỆM-MON-CHỦ-NGHĨA-XA-HỘI-KHOA-HỌC.docx\"\n",
87+
"questions = make_questions(path)\n",
88+
"answers = make_answers(path)\n",
89+
"# make csv file have 2 column \"cau hoi\" and \"dap an\"\n",
90+
"import csv\n",
91+
"with open('test-data\\chu-nghia-xa-hoi-khoa-hoc-questions.csv', 'w', newline='') as file:\n",
92+
" writer = csv.writer(file)\n",
93+
" writer.writerow([\"Question\", \"Answer\"])\n",
94+
" for i in range(len(questions)):\n",
95+
" writer.writerow([questions[i][0], answers[i]])"
96+
]
97+
},
98+
{
99+
"cell_type": "code",
100+
"execution_count": 61,
101+
"metadata": {},
102+
"outputs": [
103+
{
104+
"data": {
105+
"text/plain": [
106+
"(4, 0)"
107+
]
108+
},
109+
"execution_count": 61,
110+
"metadata": {},
111+
"output_type": "execute_result"
112+
}
113+
],
114+
"source": [
115+
"len(questions),len(answers)"
116+
]
117+
},
118+
{
119+
"cell_type": "code",
120+
"execution_count": 66,
121+
"metadata": {},
122+
"outputs": [],
123+
"source": [
124+
"questions= make_questions(path)"
125+
]
126+
},
127+
{
128+
"cell_type": "code",
129+
"execution_count": 92,
130+
"metadata": {},
131+
"outputs": [
132+
{
133+
"name": "stdout",
134+
"output_type": "stream",
135+
"text": [
136+
"Dữ liệu đã được lưu vào file CSV.\n"
137+
]
138+
},
139+
{
140+
"name": "stderr",
141+
"output_type": "stream",
142+
"text": [
143+
"<>:21: SyntaxWarning: invalid escape sequence '\\P'\n",
144+
"<>:21: SyntaxWarning: invalid escape sequence '\\P'\n",
145+
"C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\77413453.py:21: SyntaxWarning: invalid escape sequence '\\P'\n",
146+
" with open(\"test-data\\Phap_Luat_Trac_Nghiem.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as file:\n"
147+
]
148+
}
149+
],
150+
"source": [
151+
"import re\n",
152+
"\n",
153+
"# Đọc nội dung từ file DOCX\n",
154+
"doc = docx.Document(\"D:/pldc.docx\")\n",
155+
"text = \"\\n\".join([paragraph.text for paragraph in doc.paragraphs])\n",
156+
"# Tách câu hỏi dựa trên từ khóa \"Câu \"\"Đáp án\"\n",
157+
"questions = re.split(r\"(?=Câu \\d+\\. .*?)\", text)\n",
158+
"\n",
159+
"# Xử lý từng câu hỏi\n",
160+
"data = []\n",
161+
"for question in questions:\n",
162+
" if question.strip(): # Bỏ qua các đoạn trống\n",
163+
" match = re.search(r\"(Câu \\d+\\. .*?)\", question, re.DOTALL)\n",
164+
" if match:\n",
165+
" full_question = match.group(1).strip()\n",
166+
" correct_answer = match.group(2).strip()\n",
167+
" data.append({\"Question\": full_question})\n",
168+
"\n",
169+
"# Lưu dữ liệu vào file CSV\n",
170+
"import csv\n",
171+
"with open(\"test-data\\Phap_Luat_Trac_Nghiem.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as file:\n",
172+
" writer = csv.DictWriter(file, fieldnames=[\"Question\"])\n",
173+
" writer.writeheader()\n",
174+
" for row in data:\n",
175+
" writer.writerow(row)\n",
176+
"print(\"Dữ liệu đã được lưu vào file CSV.\")\n"
177+
]
178+
},
179+
{
180+
"cell_type": "code",
181+
"execution_count": 134,
182+
"metadata": {},
183+
"outputs": [
184+
{
185+
"name": "stderr",
186+
"output_type": "stream",
187+
"text": [
188+
"<>:7: SyntaxWarning: invalid escape sequence '\\T'\n",
189+
"<>:7: SyntaxWarning: invalid escape sequence '\\T'\n",
190+
"C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\4145004070.py:7: SyntaxWarning: invalid escape sequence '\\T'\n",
191+
" output_csv = \"test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv\" # File xuất CSV\n"
192+
]
193+
},
194+
{
195+
"name": "stdout",
196+
"output_type": "stream",
197+
"text": [
198+
"Dữ liệu đã được tách và lưu vào test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv.\n"
199+
]
200+
}
201+
],
202+
"source": [
203+
"from docx import Document\n",
204+
"import csv\n",
205+
"import re\n",
206+
"\n",
207+
"# Đường dẫn file\n",
208+
"input_docx = \"D:/tthcm.docx\" # Thay bằng đường dẫn thực tế\n",
209+
"output_csv = \"test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv\" # File xuất CSV\n",
210+
"\n",
211+
"# Mở file DOCX\n",
212+
"document = Document(input_docx)\n",
213+
"\n",
214+
"# Biến lưu trữ\n",
215+
"data = []\n",
216+
"current_question = \"\"\n",
217+
"answers = []\n",
218+
"\n",
219+
"# Hàm kiểm tra xem đoạn văn có thuộc danh sách không\n",
220+
"def is_list_item(paragraph):\n",
221+
" return paragraph.style.name.startswith(\"List\") # Kiểm tra nếu paragraph thuộc danh sách\n",
222+
"\n",
223+
"# Đọc từng đoạn văn trong file DOCX\n",
224+
"for paragraph in document.paragraphs:\n",
225+
" text = paragraph.text.strip()\n",
226+
"\n",
227+
" if is_list_item(paragraph):\n",
228+
" # Kiểm tra nếu là đáp án (bắt đầu bằng a), b), ...)\n",
229+
" if re.match(r\"^[a-d]\\)\\s\", text):\n",
230+
" answers.append(text)\n",
231+
" elif text: # Coi là câu hỏi nếu không khớp với định dạng đáp án\n",
232+
" # Lưu câu hỏi trước đó (nếu có)\n",
233+
" if current_question and answers:\n",
234+
" correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
235+
" data.append([current_question.strip(), correct_answer])\n",
236+
" answers = [] # Reset danh sách đáp án\n",
237+
" current_question = text # Cập nhật câu hỏi mới\n",
238+
" elif text: # Đoạn văn không thuộc danh sách (xử lý câu hỏi độc lập)\n",
239+
" if current_question and answers:\n",
240+
" correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
241+
" data.append([current_question.strip(), correct_answer])\n",
242+
" current_question = \"\"\n",
243+
" answers = []\n",
244+
"\n",
245+
"# Xử lý câu hỏi cuối cùng\n",
246+
"if current_question and answers:\n",
247+
" correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
248+
" data.append([current_question.strip(), correct_answer])\n",
249+
"\n",
250+
"# Ghi dữ liệu vào file CSV\n",
251+
"with open(output_csv, mode=\"w\", encoding=\"utf-8\", newline=\"\") as file:\n",
252+
" writer = csv.writer(file)\n",
253+
" writer.writerow([\"Question\", \"Answer\"]) # Tiêu đề cột\n",
254+
" writer.writerows(data)\n",
255+
"\n",
256+
"print(f\"Dữ liệu đã được tách và lưu vào {output_csv}.\")\n"
257+
]
258+
},
259+
{
260+
"cell_type": "code",
261+
"execution_count": 131,
262+
"metadata": {},
263+
"outputs": [
264+
{
265+
"data": {
266+
"text/plain": [
267+
"[]"
268+
]
269+
},
270+
"execution_count": 131,
271+
"metadata": {},
272+
"output_type": "execute_result"
273+
}
274+
],
275+
"source": [
276+
"answers"
277+
]
278+
},
279+
{
280+
"cell_type": "code",
281+
"execution_count": null,
282+
"metadata": {},
283+
"outputs": [],
284+
"source": []
285+
}
286+
],
287+
"metadata": {
288+
"kernelspec": {
289+
"display_name": "base",
290+
"language": "python",
291+
"name": "python3"
292+
},
293+
"language_info": {
294+
"codemirror_mode": {
295+
"name": "ipython",
296+
"version": 3
297+
},
298+
"file_extension": ".py",
299+
"mimetype": "text/x-python",
300+
"name": "python",
301+
"nbconvert_exporter": "python",
302+
"pygments_lexer": "ipython3",
303+
"version": "3.12.4"
304+
}
305+
},
306+
"nbformat": 4,
307+
"nbformat_minor": 2
308+
}
Binary file not shown.

0 commit comments

Comments
 (0)