dinhthienan33
diff --git a/‎.gitignore
+2-1 b/‎.gitignore
+2-1
diff --git a/‎app.py
+1-1 b/‎app.py
+1-1
diff --git a/‎make_question.ipynb
+308 b/‎make_question.ipynb
+308
diff --git a/‎test-data/PHAP_LUAT_DAI_CUONG-đã mở khóa.pdf
1.33 MB b/‎test-data/PHAP_LUAT_DAI_CUONG-đã mở khóa.pdf
1.33 MB
@@ -1,2 +1,3 @@
 /uit-chatbot
-/langchain-rag-tutorial
+/langchain-rag-tutorial
+.env
@@ -99,7 +99,7 @@ def answer_query(query, context):
     Answer a query using a language model with the given context.
     """
     try:
-        api_key = os.getenv("GOOGLE_API_KEY",default="AIzaSyBfdagFw6mZF02sgemJzCI2OoXikNERnTc")  # Replace "default-key" with actual default or raise error
+        api_key = os.getenv("GOOGLE_API_KEY")  # Replace "default-key" with actual default or raise error
         llm = ChatGoogleGenerativeAI(
             model="gemini-1.5-pro",
             temperature=0,
 
@@ -0,0 +1,308 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 65,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import docx\n",
+    "\n",
+    "def make_questions(path):\n",
+    "    #read docx file\n",
+    "    doc = docx.Document(path)\n",
+    "    # Danh sách chứa các câu hỏi\n",
+    "    questions = []\n",
+    "    current_question = \"\"\n",
+    "    # Lặp qua từng đoạn văn trong file DOCX\n",
+    "    for paragraph in doc.paragraphs:\n",
+    "        text = paragraph.text.strip()\n",
+    "        \n",
+    "        # Nếu bắt đầu bằng \"Câu\", đó là câu hỏi mới\n",
+    "        if text.startswith(\"Câu\"):\n",
+    "            if current_question:  # Nếu đã có câu hỏi trước đó, thêm vào danh sách\n",
+    "                questions.append([current_question[7:]])\n",
+    "            current_question = text  # Bắt đầu câu hỏi mới\n",
+    "        elif current_question:  # Nếu không phải câu hỏi, tiếp tục thêm đáp án\n",
+    "            current_question += \" \" + text\n",
+    "\n",
+    "    # Thêm câu hỏi cuối cùng (nếu có)\n",
+    "    if current_question:\n",
+    "        questions.append([current_question.strip()])\n",
+    "    return questions"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 58,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from docx import Document\n",
+    "# Path to the DOCX file\n",
+    "def make_answers(path):\n",
+    "    # Open the Word document\n",
+    "    document = Document(path)\n",
+    "\n",
+    "    # List to store underlined text\n",
+    "    underlined_text = []\n",
+    "    current_underlined = \"\"  # To handle multi-line underlined text\n",
+    "\n",
+    "    # Loop through all paragraphs and their runs\n",
+    "    for paragraph in document.paragraphs:\n",
+    "        for run in paragraph.runs:\n",
+    "            if run.underline:  # Check if the run is underlined\n",
+    "                current_underlined += run.text.strip() + \"\"  # Append text and keep it continuous\n",
+    "            else:\n",
+    "                if current_underlined:  # If switching from underlined to non-underlined text\n",
+    "                    underlined_text.append(current_underlined.strip())\n",
+    "                    current_underlined = \"\"  # Reset the temporary variable\n",
+    "\n",
+    "    # Add any remaining underlined text\n",
+    "    if current_underlined:\n",
+    "        underlined_text.append(current_underlined.strip())\n",
+    "\n",
+    "    # Return the list of underlined text\n",
+    "    return underlined_text\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 95,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:6: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "<>:6: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\437966213.py:6: SyntaxWarning: invalid escape sequence '\\c'\n",
+      "  with open('test-data\\chu-nghia-xa-hoi-khoa-hoc-questions.csv', 'w', newline='') as file:\n"
+     ]
+    }
+   ],
+   "source": [
+    "path = \"D:/533463523-NGAN-HANG-CAU-HỎI-TRẮC-NGHIỆM-MON-CHỦ-NGHĨA-XA-HỘI-KHOA-HỌC.docx\"\n",
+    "questions = make_questions(path)\n",
+    "answers = make_answers(path)\n",
+    "# make csv file have 2 column \"cau hoi\" and \"dap an\"\n",
+    "import csv\n",
+    "with open('test-data\\chu-nghia-xa-hoi-khoa-hoc-questions.csv', 'w', newline='') as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    writer.writerow([\"Question\", \"Answer\"])\n",
+    "    for i in range(len(questions)):\n",
+    "        writer.writerow([questions[i][0], answers[i]])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "(4, 0)"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "len(questions),len(answers)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 66,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "questions= make_questions(path)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 92,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dữ liệu đã được lưu vào file CSV.\n"
+     ]
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:21: SyntaxWarning: invalid escape sequence '\\P'\n",
+      "<>:21: SyntaxWarning: invalid escape sequence '\\P'\n",
+      "C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\77413453.py:21: SyntaxWarning: invalid escape sequence '\\P'\n",
+      "  with open(\"test-data\\Phap_Luat_Trac_Nghiem.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as file:\n"
+     ]
+    }
+   ],
+   "source": [
+    "import re\n",
+    "\n",
+    "# Đọc nội dung từ file DOCX\n",
+    "doc = docx.Document(\"D:/pldc.docx\")\n",
+    "text = \"\\n\".join([paragraph.text for paragraph in doc.paragraphs])\n",
+    "# Tách câu hỏi dựa trên từ khóa \"Câu \" và \"Đáp án\"\n",
+    "questions = re.split(r\"(?=Câu \\d+\\. .*?)\", text)\n",
+    "\n",
+    "# Xử lý từng câu hỏi\n",
+    "data = []\n",
+    "for question in questions:\n",
+    "    if question.strip():  # Bỏ qua các đoạn trống\n",
+    "        match = re.search(r\"(Câu \\d+\\. .*?)\", question, re.DOTALL)\n",
+    "        if match:\n",
+    "            full_question = match.group(1).strip()\n",
+    "            correct_answer = match.group(2).strip()\n",
+    "            data.append({\"Question\": full_question})\n",
+    "\n",
+    "# Lưu dữ liệu vào file CSV\n",
+    "import csv\n",
+    "with open(\"test-data\\Phap_Luat_Trac_Nghiem.csv\", \"w\", newline=\"\", encoding=\"utf-8\") as file:\n",
+    "    writer = csv.DictWriter(file, fieldnames=[\"Question\"])\n",
+    "    writer.writeheader()\n",
+    "    for row in data:\n",
+    "        writer.writerow(row)\n",
+    "print(\"Dữ liệu đã được lưu vào file CSV.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 134,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "<>:7: SyntaxWarning: invalid escape sequence '\\T'\n",
+      "<>:7: SyntaxWarning: invalid escape sequence '\\T'\n",
+      "C:\\Users\\APTS\\AppData\\Local\\Temp\\ipykernel_17312\\4145004070.py:7: SyntaxWarning: invalid escape sequence '\\T'\n",
+      "  output_csv = \"test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv\"  # File xuất CSV\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "Dữ liệu đã được tách và lưu vào test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv.\n"
+     ]
+    }
+   ],
+   "source": [
+    "from docx import Document\n",
+    "import csv\n",
+    "import re\n",
+    "\n",
+    "# Đường dẫn file\n",
+    "input_docx = \"D:/tthcm.docx\"  # Thay bằng đường dẫn thực tế\n",
+    "output_csv = \"test-data\\Tu_Tuong_HCM_Trac_Nghiem.csv\"  # File xuất CSV\n",
+    "\n",
+    "# Mở file DOCX\n",
+    "document = Document(input_docx)\n",
+    "\n",
+    "# Biến lưu trữ\n",
+    "data = []\n",
+    "current_question = \"\"\n",
+    "answers = []\n",
+    "\n",
+    "# Hàm kiểm tra xem đoạn văn có thuộc danh sách không\n",
+    "def is_list_item(paragraph):\n",
+    "    return paragraph.style.name.startswith(\"List\")  # Kiểm tra nếu paragraph thuộc danh sách\n",
+    "\n",
+    "# Đọc từng đoạn văn trong file DOCX\n",
+    "for paragraph in document.paragraphs:\n",
+    "    text = paragraph.text.strip()\n",
+    "\n",
+    "    if is_list_item(paragraph):\n",
+    "        # Kiểm tra nếu là đáp án (bắt đầu bằng a), b), ...)\n",
+    "        if re.match(r\"^[a-d]\\)\\s\", text):\n",
+    "            answers.append(text)\n",
+    "        elif text:  # Coi là câu hỏi nếu không khớp với định dạng đáp án\n",
+    "            # Lưu câu hỏi trước đó (nếu có)\n",
+    "            if current_question and answers:\n",
+    "                correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
+    "                data.append([current_question.strip(), correct_answer])\n",
+    "                answers = []  # Reset danh sách đáp án\n",
+    "            current_question = text  # Cập nhật câu hỏi mới\n",
+    "    elif text:  # Đoạn văn không thuộc danh sách (xử lý câu hỏi độc lập)\n",
+    "        if current_question and answers:\n",
+    "            correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
+    "            data.append([current_question.strip(), correct_answer])\n",
+    "            current_question = \"\"\n",
+    "            answers = []\n",
+    "\n",
+    "# Xử lý câu hỏi cuối cùng\n",
+    "if current_question and answers:\n",
+    "    correct_answer = next((a.replace(\"(đ)\", \"\").strip() for a in answers if \"(đ)\" in a), \"\")\n",
+    "    data.append([current_question.strip(), correct_answer])\n",
+    "\n",
+    "# Ghi dữ liệu vào file CSV\n",
+    "with open(output_csv, mode=\"w\", encoding=\"utf-8\", newline=\"\") as file:\n",
+    "    writer = csv.writer(file)\n",
+    "    writer.writerow([\"Question\", \"Answer\"])  # Tiêu đề cột\n",
+    "    writer.writerows(data)\n",
+    "\n",
+    "print(f\"Dữ liệu đã được tách và lưu vào {output_csv}.\")\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 131,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "[]"
+      ]
+     },
+     "execution_count": 131,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "answers"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.4"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}