From d92442ce28a6cea7542db4d7b21be77a3e4f50b2 Mon Sep 17 00:00:00 2001 From: Isaac Rudnick <48895941+IsaacFigNewton@users.noreply.github.com> Date: Sat, 17 Jan 2026 20:55:43 -0800 Subject: [PATCH 1/3] Update README, requirements, and add .gitignore Added a .gitignore to exclude notebooks/.env. Updated the README with step-by-step instructions for using the QA pipeline with custom files. Modified requirements.txt to allow newer versions of typer and commented out uvloop for compatibility. --- .gitignore | 1 + README.md | 9 ++++++++- requirements.txt | 4 ++-- 3 files changed, 11 insertions(+), 3 deletions(-) create mode 100644 .gitignore diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..0b57c17 --- /dev/null +++ b/.gitignore @@ -0,0 +1 @@ +notebooks/.env \ No newline at end of file diff --git a/README.md b/README.md index 8116d7d..95ffbae 100644 --- a/README.md +++ b/README.md @@ -5,4 +5,11 @@ The following repo contains the materials for my talk delivered at GOTO Amsterda The repo contains the following: * `/notebooks/rag-pdf-qa.ipynb` contains the code for the simple RAG pipeline I demoed during the talk. There are extensive notes in Markdown in this notebook to help you understand how to adapt this for your own use case. * `talk-materials/talk-sources.md` contains all of the papers and other sources I used for this talk. It also contains all of my image credits. -* `talk-materials/beyond-the-hype.pdf` contains a copy of my slides. \ No newline at end of file +* `talk-materials/beyond-the-hype.pdf` contains a copy of my slides. + +# Steps for using the QA pipeline with your own files +1. Create a copy of `.env_sample` and rename it to `.env` +2. Get an OpenAI key from [here](https://platform.openai.com/api-keys) +3. Replace `key-here` with your API key in `.env` and save the file +4. Start a terminal window in the root of this repository +5. 
Enter `pip install -r requirements.txt` \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a7d173f..7f635bc 100644 --- a/requirements.txt +++ b/requirements.txt @@ -192,7 +192,7 @@ tornado==6.4 tqdm==4.66.2 traitlets==5.14.2 transformers==4.39.1 -typer==0.10.0 +typer>=0.10.0 types-python-dateutil==2.9.0.20240316 typing-inspect==0.9.0 typing_extensions==4.10.0 @@ -201,7 +201,7 @@ ujson==5.10.0 uri-template==1.3.0 urllib3==2.2.1 uvicorn==0.29.0 -uvloop==0.19.0 +# uvloop==0.19.0 watchfiles==0.21.0 wcwidth==0.2.13 webcolors==1.13 From 8e17fc4016340d7c54e2b9b612736d95a52ff59c Mon Sep 17 00:00:00 2001 From: Isaac Rudnick <48895941+IsaacFigNewton@users.noreply.github.com> Date: Sat, 17 Jan 2026 21:20:33 -0800 Subject: [PATCH 2/3] Improve PDF loading and update dependencies for RAG notebook Updated the PdfQA class in the RAG notebook to use 'plain' extraction mode for PDF loading with a fallback for font parsing issues. Added 'langchain-chroma' to requirements.txt --- notebooks/rag-pdf-qa.ipynb | 276 +++++++++++++++++++++---------------- requirements.txt | 1 + 2 files changed, 159 insertions(+), 118 deletions(-) diff --git a/notebooks/rag-pdf-qa.ipynb b/notebooks/rag-pdf-qa.ipynb index e7996e9..819d4c9 100644 --- a/notebooks/rag-pdf-qa.ipynb +++ b/notebooks/rag-pdf-qa.ipynb @@ -1,8 +1,9 @@ { "cells": [ { - "metadata": {}, "cell_type": "markdown", + "id": "19e695dc29bb9a3a", + "metadata": {}, "source": [ "# Simple RAG pipeline allowing you to \"talk\" to your documentation\n", "\n", @@ -13,17 +14,19 @@ "* Load in our PDF that we want to \"chat\" to;\n", "* We can't pass the whole PDF into a model at the same time (it's almost 2000 pages!). As such, we need to split it into chunks;\n", "* Rather than needing to pass every individual chunk through the LLM to find the information in the document relevant to a question, we can convert these chunks into document embeddings, which we then store in a vector database. 
At query time, the question is also converted into a document embedding, and the most similar document chunks to the question are retrieved." - ], - "id": "19e695dc29bb9a3a" + ] }, { + "cell_type": "code", + "execution_count": 1, + "id": "a98011df002e3ac7", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:45:06.805519Z", "start_time": "2024-12-06T11:45:04.741903Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "from dotenv import load_dotenv\n", "\n", @@ -35,34 +38,28 @@ "\n", "import re\n", "import PyPDF2" - ], - "id": "a98011df002e3ac7", - "outputs": [], - "execution_count": 1 + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "ac5853364f53f165", + "metadata": {}, "source": [ "## Count the number of pages in the PDF\n", "\n", "As you can see, we have a lot of documentation to sort through here!" - ], - "id": "ac5853364f53f165" + ] }, { + "cell_type": "code", + "execution_count": 2, + "id": "1ad45b37d3f23ea5", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:45:40.679685Z", "start_time": "2024-12-06T11:45:40.430619Z" } }, - "cell_type": "code", - "source": [ - "pdf = PyPDF2.PdfReader(open(\"../materials/pycharm-documentation.pdf\", \"rb\"))\n", - "len(pdf.pages)" - ], - "id": "1ad45b37d3f23ea5", "outputs": [ { "data": { @@ -75,16 +72,22 @@ "output_type": "execute_result" } ], - "execution_count": 2 + "source": [ + "pdf = PyPDF2.PdfReader(open(\"../materials/pycharm-documentation.pdf\", \"rb\"))\n", + "len(pdf.pages)" + ] }, { + "cell_type": "code", + "execution_count": 3, + "id": "a6c1a3bcaaee5c40", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:45:41.994376Z", "start_time": "2024-12-06T11:45:41.988630Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "class PdfQA:\n", " \"\"\"\n", @@ -101,7 +104,7 @@ "\n", " def __init__(self, model, pdf_document, chunk_size, chunk_overlap,\n", " search_type, n_documents, chain_type):\n", - " load_dotenv()\n", + " load_dotenv('.env')\n", " self.init_chat_model(model)\n", " 
self.load_documents(pdf_document)\n", " self.split_documents(chunk_size, chunk_overlap)\n", @@ -130,8 +133,14 @@ " :return: None\n", " \"\"\"\n", " print(\"Loading PDFs\")\n", - " pdf_loader = PyPDFLoader(pdf_document)\n", - " self.documents = pdf_loader.load()\n", + " # Try with plain extraction mode to avoid font parsing issues\n", + " try:\n", + " pdf_loader = PyPDFLoader(pdf_document, extraction_mode=\"plain\")\n", + " self.documents = pdf_loader.load()\n", + " except:\n", + " # Fallback to default mode if plain mode is not available\n", + " pdf_loader = PyPDFLoader(pdf_document)\n", + " self.documents = pdf_loader.load()\n", "\n", " def split_documents(self, chunk_size, chunk_overlap):\n", " \"\"\"\n", @@ -187,14 +196,12 @@ " :return: The chain of the object.\n", " \"\"\"\n", " return self.chain" - ], - "id": "a6c1a3bcaaee5c40", - "outputs": [], - "execution_count": 3 + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "337a752314ad724", + "metadata": {}, "source": [ "## Levers in the RAG pipeline\n", "RAG is quite tricky to get right, especially if you need it to be efficient. There are many levers we can pull in our pipeline, which influence the following things:\n", @@ -212,126 +219,144 @@ "* `chain_type`: this controls how the content is passed into the LLM. In the case of \"stuff\" it passes all gathered context chunks into the context window at once. Other options are \"refine\", which feeds in the chunks in batches, plus the answer generated so far, and \"map-rerank\", which feeds in each chunk and assigns a score based on how well it answered the question.\n", "\n", "Other levers I've chosen not to make arguments in this class are the model used for embeddings (the `OpenAIEmbeddings` were used) and which vector database we use to store the document embeddings (in this case, the `Chroma` vector store was used)." 
- ], - "id": "337a752314ad724" + ] }, { + "cell_type": "code", + "execution_count": 4, + "id": "d289d321d47cd47f", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:35:12.839701Z", "start_time": "2024-12-06T11:35:12.836663Z" } }, - "cell_type": "code", - "source": "load_dotenv()", - "id": "d289d321d47cd47f", "outputs": [ { "data": { "text/plain": [ - "False" + "True" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 5 + "source": [ + "load_dotenv('.env')" + ] }, { + "cell_type": "code", + "execution_count": 5, + "id": "696e06baa6306eeb", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:19.300978Z", "start_time": "2024-12-06T11:45:46.942267Z" } }, - "cell_type": "code", - "source": [ - "pdf_qa = PdfQA(\"gpt-3.5-turbo\", \"../materials/pycharm-documentation.pdf\", 1000, 0, \"similarity\", \n", - " 5, \"stuff\")\n", - "pdf_qa_chain = pdf_qa.query_chain()" - ], - "id": "696e06baa6306eeb", "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading model\n", - "Loading PDFs\n", + "Loading PDFs\n" + ] + }, + { + "name": "stderr", + "output_type": "stream", + "text": [ + "C:\\Users\\igeek\\AppData\\Roaming\\Python\\Python312\\site-packages\\pypdf\\_crypt_providers\\_cryptography.py:32: CryptographyDeprecationWarning: ARC4 has been moved to cryptography.hazmat.decrepit.ciphers.algorithms.ARC4 and will be removed from cryptography.hazmat.primitives.ciphers.algorithms in 48.0.0.\n", + " from cryptography.hazmat.primitives.ciphers.algorithms import AES, ARC4\n" + ] + }, + { + "name": "stdout", + "output_type": "stream", + "text": [ "Splitting documents\n", "Creating document embeddings\n", "Generating chunk retriever\n" ] } ], - "execution_count": 4 + "source": [ + "pdf_qa = PdfQA(\"gpt-3.5-turbo\", \"../materials/pycharm-documentation.pdf\", 1000, 0, \"similarity\", \n", + " 5, \"stuff\")\n", + "pdf_qa_chain = pdf_qa.query_chain()" + ] }, { - "metadata": {}, 
"cell_type": "markdown", - "source": "Let's try it out by asking how we can debug in PyCharm.", - "id": "e1618a7539d451a0" + "id": "e1618a7539d451a0", + "metadata": {}, + "source": [ + "Let's try it out by asking how we can debug in PyCharm." + ] }, { + "cell_type": "code", + "execution_count": 6, + "id": "ec7e8f62e1ff0803", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:33.578117Z", "start_time": "2024-12-06T11:46:31.349948Z" } }, - "cell_type": "code", - "source": "answer1 = pdf_qa_chain.invoke({\"query\": \"What are the options for debugging with PyCharm?\"})", - "id": "ec7e8f62e1ff0803", "outputs": [], - "execution_count": 5 + "source": [ + "answer1 = pdf_qa_chain.invoke({\"query\": \"What are the options for debugging with PyCharm?\"})" + ] }, { + "cell_type": "code", + "execution_count": 7, + "id": "6d1fbc81d231df98", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:35.143288Z", "start_time": "2024-12-06T11:46:35.140068Z" } }, - "cell_type": "code", - "source": "answer1[\"result\"]", - "id": "6d1fbc81d231df98", "outputs": [ { "data": { "text/plain": [ - "'The options for debugging with PyCharm include placing breakpoints at specific lines of code where program execution will be suspended, stepping through the code line by line, evaluating expressions, adding watches, and manually setting variable values. You can start debugging by pressing a specific key, and then navigate through the program execution using the available options in the Run menu or the Debug tool window. Additionally, PyCharm provides a Debug tool window with dedicated panes for frames, variables, watches, and a Console tab for input and output information.'" + "'The options for debugging with PyCharm include placing breakpoints at specific lines of code where program execution will be suspended, stepping through the code line by line, evaluating arbitrary expressions, adding watches to monitor variables, and manually setting values for variables. 
Additionally, PyCharm allows you to start debugging sessions by clicking on the gutter and selecting the Debug command, as well as providing a Debug tool window with dedicated panes for frames, variables, watches, and a Console tab for input and output information.'" ] }, - "execution_count": 6, + "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 6 + "source": [ + "answer1[\"result\"]" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "We can see the answer is very comprehensive. Let's have a look at the information it was based on from the documentation.", - "id": "2d0f4f9b2ba9b26" + "id": "2d0f4f9b2ba9b26", + "metadata": {}, + "source": [ + "We can see the answer is very comprehensive. Let's have a look at the information it was based on from the documentation." + ] }, { + "cell_type": "code", + "execution_count": 8, + "id": "f1453f0dff1fe05d", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:40.067281Z", "start_time": "2024-12-06T11:46:40.065048Z" } }, - "cell_type": "code", - "source": [ - "for document in answer1[\"source_documents\"]:\n", - " index_n = answer1[\"source_documents\"].index(document)\n", - " print(f\"\\nDOCUMENT {index_n + 1}\")\n", - " print(re.sub(r\"\\s+\", \" \", document.page_content.strip()))" - ], - "id": "f1453f0dff1fe05d", "outputs": [ { "name": "stdout", @@ -339,7 +364,7 @@ "text": [ "\n", "DOCUMENT 1\n", - "Debug Does your application stumble on a runtime error? To find out what’s causing it, you will have to do some debugging. PyCharm supports the debugger on all platforms. Debugging starts with placing breakpoints at which program execution will be suspended, so you can explore program data. Just click the gutter of the line where you want the breakpoint to appear. To start debugging your application, press . 
Then go through the program execution step by step (see the available options in the Run menu or the Debug tool window), evaluate any arbitrary expression, add watches, and manually set values for the variables. For more information, refer to Debugging. Test It is a good idea to test your applications, and PyCharm helps doing it as simple as possible. With PyCharm, you can: ⌃Ctrl D Create tests• Create special testing run/debug configurations.• Run and debug tests right from the IDE, using the testing run/debug configurations.•\n", + "Debug Does your application stumble on a runtime error? To find out what’s causing it, you will have to do some debugging. PyCharm supports the debugger on all platforms. Debugging starts with placing breakpoints at which program execution will be suspended, so you can explore program data. Just click the gutter of the line where you want the breakpoint to appear. To start debugging your application, press . Then go through the program execution step by step (see the available options in the Run menu or the Debug tool window), evaluate any arbitrary expression, add watches, and manually set values for the variables. For more information, refer to Debugging. Test It is a good idea to test your applications, and PyCharm helps doing it as simple as possible. With PyCharm, you can: ⌃Ctrl D Create tests • Create special testing run/debug configurations. • Run and debug tests right from the IDE, using the testing run/debug configurations. •\n", "\n", "DOCUMENT 2\n", "For more information, refer to Breakpoints. Starting the debugger session OK now, as we've added breakpoints, everything is ready for debugging. PyCharm allows starting the debugger session in several ways. 
Let's choose one: click in the gutter, and then select the command Debug 'solver' in the popup menu that opens:\n", @@ -351,103 +376,111 @@ "Debugging in detail The Debug tool window consists of dedicated panes for frames, variables, and watches, as well as the Console tab, where all the input and output information is displayed. If you want the console to be always visible, you can drag it to one of the PyCharm window's edges. Stepping If you want to see what your code does line by line, there's no need to put a breakpoint on every line, you can step through your code. Let's see what it looks like to step through our example program. Start or restart the debugger by using the Run widget at the top of the window:\n", "\n", "DOCUMENT 5\n", - "Python support in PyCharm PyCharm provides the following features to help you work with Python: Feature PyCharm Community PyCharm Professional Dedicated project types Ability to configure local interpreters and virtual environments. Ability to configure remote and docker-based interpreters. Python console. Run/debug configurations for Python Run/debug configurations for Python remote debug Code insight, Code inspections, Intention actions, and Code completion Built-in code formatter and separate set of Python code style settings Limited to Python, HTML, JSON, XML, and YAML Find usages in Python code.\n" + "Python support in PyCharm PyCharm provides the following features to help you work with Python: Feature PyCharm Community PyCharm Professional Dedicated project types Ability to configure local interpreters and virtual environments. Ability to configure remote and docker-based interpreters. Python console. 
Run/debug configurations for Python Run/debug configurations for Python remote debug Code insight, Code inspections, Intention actions, and Code completion Built-in code formatter and separate set of Python code style settingsLimited to Python, HTML, JSON, XML, and YAML Find usages in Python code.\n" ] } ], - "execution_count": 7 + "source": [ + "for document in answer1[\"source_documents\"]:\n", + " index_n = answer1[\"source_documents\"].index(document)\n", + " print(f\"\\nDOCUMENT {index_n + 1}\")\n", + " print(re.sub(r\"\\s+\", \" \", document.page_content.strip()))" + ] }, { - "metadata": {}, "cell_type": "markdown", + "id": "988b16f0387ce399", + "metadata": {}, "source": [ "We can see that the first three chunks are the most relevant, while the last three don't really add that much to the answer.\n", "\n", "If we'd like, we can go a bit deeper with our answer. We can set up a memory for the last answer the LLM gave us so we can ask follow up questions. In this case, let's see if the LLM left out anything about PyCharm's debugging." 
- ], - "id": "988b16f0387ce399" + ] }, { + "cell_type": "code", + "execution_count": 9, + "id": "233bb88309ba48d7", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:44.813876Z", "start_time": "2024-12-06T11:46:43.887673Z" } }, - "cell_type": "code", + "outputs": [], "source": [ "chat_history1 = [(answer1[\"query\"], answer1[\"result\"])]\n", "answer2 = pdf_qa_chain.invoke({\"query\": \"Have you left out any other types of debugging?\",\n", " \"chat_history\": chat_history1})" - ], - "id": "233bb88309ba48d7", - "outputs": [], - "execution_count": 8 + ] }, { + "cell_type": "code", + "execution_count": 10, + "id": "3d5d632b16484d7a", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:46.406735Z", "start_time": "2024-12-06T11:46:46.404038Z" } }, - "cell_type": "code", - "source": "answer2[\"result\"]", - "id": "3d5d632b16484d7a", "outputs": [ { "data": { "text/plain": [ - "'Yes, the provided context does not mention any other types of debugging besides stepping through the code, working with variables, and using the console for input and output information.'" + "'Yes, there are other types of debugging mentioned in the context provided. 
Some of them include:\\n- Debugging in detail with dedicated panes for frames, variables, and watches in the Debug tool window.\\n- Stepping through code to see what it does line by line without putting breakpoints on every line.\\n- Enabling inline debugging values in the debugger settings.\\n- Debugging JavaScript for Professional edition users.\\n- Debugging Django templates by adding breakpoints and stepping through the template.\\n- Working in the Threads and Variables tab to observe variables used in the application.\\n- Working in the Console tab to see error messages or perform calculations not related to the current application.\\n\\nThese are some additional types of debugging mentioned in the context.'" ] }, - "execution_count": 9, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 9 + "source": [ + "answer2[\"result\"]" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "If our model is capable of it, we can even enter queries in a different language to the source documentation, and get relevant answers back in this language. Here we question our English-language documentation in German ...", - "id": "c6922b4d1d9ba8fa" + "id": "c6922b4d1d9ba8fa", + "metadata": {}, + "source": [ + "If our model is capable of it, we can even enter queries in a different language to the source documentation, and get relevant answers back in this language. Here we question our English-language documentation in German ..." 
+ ] }, { + "cell_type": "code", + "execution_count": 11, + "id": "977361cabc240a1a", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:46:59.444462Z", "start_time": "2024-12-06T11:46:56.765511Z" } }, - "cell_type": "code", - "source": "answer3 = pdf_qa_chain.invoke({\"query\": \"Wie kann man PyCharm installieren?\"})", - "id": "977361cabc240a1a", "outputs": [], - "execution_count": 10 + "source": [ + "answer3 = pdf_qa_chain.invoke({\"query\": \"Wie kann man PyCharm installieren?\"})" + ] }, { - "metadata": {}, "cell_type": "markdown", - "source": "... and get a relevant answer in German!", - "id": "775a8861805a7351" + "id": "775a8861805a7351", + "metadata": {}, + "source": [ + "... and get a relevant answer in German!" + ] }, { + "cell_type": "code", + "execution_count": 12, + "id": "7bc9df603729ace6", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:47:00.951035Z", "start_time": "2024-12-06T11:47:00.948824Z" } }, - "cell_type": "code", - "source": [ - "for document in answer3[\"source_documents\"]:\n", - " index_n = answer3[\"source_documents\"].index(document)\n", - " print(f\"\\nDOCUMENT {index_n + 1}\")\n", - " print(re.sub(r\"\\s+\", \" \", document.page_content.strip()))" - ], - "id": "7bc9df603729ace6", "outputs": [ { "name": "stdout", @@ -455,47 +488,54 @@ "text": [ "\n", "DOCUMENT 1\n", - "PyCharm 2024.1 Getting started/Installation guide Last modified: 06 May 2024 PyCharm is a cross-platform IDE that provides consistent experience on the Windows, macOS, and Linux operating systems. PyCharm is available in two editions: Professional, and Community. The Community edition is an open-source project, and it's free, but it has fewer features. The Professional edition is commercial, and provides an outstanding set of tools and features. For more information, refer to the editions comparison matrix↗ . 
Install PyCharm\n", + "PyCharm 2024.1 Getting started/Installation guide Last modified: 06 May 2024 PyCharm is a cross-platform IDE that provides consistent experience on the Windows, macOS, and Linux operating systems. PyCharm is available in two editions: Professional, and Community. The Community edition is an open-source project, and it's free, but it has fewer features. The Professional edition is commercial, and provides an outstanding set of tools and features. For more information, refer to the editions comparison matrix ↗.Install PyCharm\n", "\n", "DOCUMENT 2\n", - "To run PyCharm, find it in the Windows Start menu or use the desktop shortcut. You can also run the launcher batch script or executable in the installation directory under bin. When you run PyCharm for the first time, you can take several steps to complete the installation, customize your instance, and start working with the IDE. For more information, refer to Run PyCharm for the first time. For more information about the location of the default IDE directories with user- specific files, refer to Directories used by the IDE. Silent installation on Windows Silent installation is performed without any user interface. It can be used by network administrators to install PyCharm on a number of machines and avoid interrupting other users. To perform silent install, run the installer with the following switches: There is a separate installer for ARM64 processors. To verify the integrity of the installer, use the SHA checksum linked from the Download↗ page. Run the installer and follow the wizard steps. Mind the following options in the installation wizard 2. 64-bit launcher: Adds a launching icon to the Desktop.• Open Folder as Project: Adds an option to the folder context menu that will allow opening the selected directory as a PyCharm project. 
• .py: Establishes an association with Python files to open them in PyCharm.• Add launchers dir to the PATH: Allows running this PyCharm instance from the Console without specifying the path to it. •\n", + "You can install PyCharm using Toolbox or standalone installations. If you need assistance installing PyCharm, see the installation instructions: Install PyCharmRequirement Minimum Recommended Operating systemOfficially released versions of the following: Pre-release versions are not supported.The latest versions of the following: Microsoft Windows 10 1809 64-bit or later Windows Server 2019 64- bit or later• macOS 12.0 or later • Ubuntu Linux 20.04 LTS or a later LTS version that uses the following:• Gnome or KDE • X Window System (X11) Wayland support is in development. You can monitor the progress and leave your feedback in JBR-3206: Native Wayland support ↗.• GLIBC ↗ 2.29 or later •Windows 64-bit • macOS • Ubuntu Linux LTS •\n", "\n", "DOCUMENT 3\n", - "You can install PyCharm using Toolbox or standalone installations. If you need assistance installing PyCharm, see the installation instructions: Install PyCharm Requirement Minimum Recommended Operating system Officially released versions of the following: Pre-release versions are not supported. The latest versions of the following: Microsoft Windows 10 1809 64-bit or later Windows Server 2019 64- bit or later • macOS 12.0 or later• Ubuntu Linux 20.04 LTS or a later LTS version that uses the following: • Gnome or KDE• X Window System (X11) Wayland support is in development. You can monitor the progress and leave your feedback in JBR-3206: Native Wayland support↗ . • GLIBC↗ 2.29 or later• Windows 64-bit• macOS• Ubuntu Linux LTS•\n", + "To run PyCharm, find it in the Windows Start menu or use the desktop shortcut. You can also run the launcher batch script or executable in the installation directory under bin. 
When you run PyCharm for the first time, you can take several steps to complete the installation, customize your instance, and start working with the IDE. For more information, refer to Run PyCharm for the first time. For more information about the location of the default IDE directories with user- specific files, refer to Directories used by the IDE. Silent installation on Windows Silent installation is performed without any user interface. It can be used by network administrators to install PyCharm on a number of machines and avoid interrupting other users. To perform silent install, run the installer with the following switches:There is a separate installer for ARM64 processors. To verify the integrity of the installer, use the SHA checksum linked from the Download ↗ page. Run the installer and follow the wizard steps. Mind the following options in the installation wizard2. 64-bit launcher: Adds a launching icon to the Desktop. • Open Folder as Project: Adds an option to the folder context menu that will allow opening the selected directory as a PyCharm project.• .py: Establishes an association with Python files to open them in PyCharm. • Add launchers dir to the PATH: Allows running this PyCharm instance from the Console without specifying the path to it.•\n", "\n", "DOCUMENT 4\n", - "Log in to your JetBrains Account from the Toolbox App, and it will automatically activate the available licenses for any IDE that you install. If you installed PyCharm via the Toolbox App↗ , you can find the installation directory in the app: open the settings of the IDE instance in the Toolbox App, expand Configuration, and look for the Install location field. Standalone installation Install PyCharm manually to manage the location of every instance and all the configuration files. For example, if you have a policy that requires specific install locations. macOS Linux Download the installer↗ .exe.1. 
Windows\n", + "PyCharm 2024.1 Getting started/Installation guide/Run PyCharm for the first time Last modified: 15 May 2024 You can use the Toolbox App to run any JetBrains product. In the case of a standalone installation, running PyCharm depends on the operating system: To run PyCharm, find it in the Windows Start menu or use the desktop shortcut. You can also run the launcher batch script or executable in the installation directory under bin. For more information about running PyCharm from the command line, refer to Command-line interface. You will see the Welcome screen, the starting point to your work with the IDE. This screen also appears when you close all opened projects. Use the tabs on the left side to switch to the specific welcome dialog.Run PyCharm for the first time macOS Linux Windows\n", "\n", "DOCUMENT 5\n", - "PyCharm 2024.1 Getting started/Installation guide/Run PyCharm for the first time Last modified: 15 May 2024 You can use the Toolbox App to run any JetBrains product. In the case of a standalone installation, running PyCharm depends on the operating system: To run PyCharm, find it in the Windows Start menu or use the desktop shortcut. You can also run the launcher batch script or executable in the installation directory under bin. For more information about running PyCharm from the command line, refer to Command-line interface. You will see the Welcome screen, the starting point to your work with the IDE. This screen also appears when you close all opened projects. Use the tabs on the left side to switch to the specific welcome dialog. Run PyCharm for the first time macOS LinuxWindows\n" + "PyCharm supports the following versions of Python: Install using the Toolbox App The JetBrains Toolbox App ↗ is the recommended tool to install JetBrains products. 
Use it to install and manage different products or several versions of the same product, including Early Access Program ↗ (EAP) and Nightly releases, update and roll back when necessary, and easily remove any tool. The Toolbox App maintains a list of all your projects to quickly open any project in the right IDE and version. Install the Toolbox AppPython 2: version 2.7 • Python 3: from the version 3.6 up to the version 3.12 • macOS Linux Download the installer .exe from the Toolbox App web page ↗. 1. Run the installer and follow the wizard steps. 2. After you run the Toolbox App, click its icon in the notification area and select which product you want to install. To install a specific version, click and select Available versions.3.Windows\n" ] } ], - "execution_count": 11 + "source": [ + "for document in answer3[\"source_documents\"]:\n", + " index_n = answer3[\"source_documents\"].index(document)\n", + " print(f\"\\nDOCUMENT {index_n + 1}\")\n", + " print(re.sub(r\"\\s+\", \" \", document.page_content.strip()))" + ] }, { + "cell_type": "code", + "execution_count": 13, + "id": "24653b6650739585", "metadata": { "ExecuteTime": { "end_time": "2024-12-06T11:47:07.926430Z", "start_time": "2024-12-06T11:47:07.924180Z" } }, - "cell_type": "code", - "source": "answer3[\"result\"]", - "id": "24653b6650739585", "outputs": [ { "data": { "text/plain": [ - "'Um PyCharm zu installieren, gibt es mehrere Möglichkeiten:\\n\\n1. **Verwendung des Toolbox-Apps**: Wenn du das Toolbox-App verwendest, kannst du PyCharm darüber installieren. Logge dich in deinem JetBrains-Konto ein und das Toolbox-App wird automatisch verfügbare Lizenzen aktivieren.\\n\\n2. **Standalone-Installation**: Du kannst PyCharm auch manuell installieren, um den Speicherort jeder Instanz und aller Konfigurationsdateien zu verwalten. Dies ist nützlich, wenn du spezifische Installationsorte benötigst.\\n\\n3. 
**Stille Installation auf Windows**: Wenn du eine stille Installation ohne Benutzeroberfläche durchführen möchtest, kannst du den Installer mit bestimmten Schaltern ausführen. Dies ist nützlich für Netzwerkadministratoren, um PyCharm auf mehreren Maschinen zu installieren, ohne andere Benutzer zu unterbrechen.\\n\\nJe nachdem, welche Methode du bevorzugst, kannst du PyCharm entweder über das Toolbox-App installieren oder die Standalone-Installation durchführen.'" + "'Sie können PyCharm mithilfe des Toolbox oder als eigenständige Installation installieren. Wenn Sie Hilfe bei der Installation von PyCharm benötigen, finden Sie die Installationsanweisungen auf der offiziellen Website von PyCharm. Es gibt auch eine stille Installationsoption für Netzwerkadministratoren, um PyCharm auf mehreren Maschinen zu installieren, ohne andere Benutzer zu unterbrechen. Stellen Sie sicher, dass Ihr Betriebssystem die Mindestanforderungen erfüllt, und führen Sie den Installationsprozess gemäß den Anweisungen aus.'" ] }, - "execution_count": 12, + "execution_count": 13, "metadata": {}, "output_type": "execute_result" } ], - "execution_count": 12 + "source": [ + "answer3[\"result\"]" + ] } ], "metadata": { @@ -507,14 +547,14 @@ "language_info": { "codemirror_mode": { "name": "ipython", - "version": 2 + "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython2", - "version": "2.7.6" + "pygments_lexer": "ipython3", + "version": "3.12.6" } }, "nbformat": 4, diff --git a/requirements.txt b/requirements.txt index 7f635bc..63e25a1 100644 --- a/requirements.txt +++ b/requirements.txt @@ -84,6 +84,7 @@ jupyterlab_server==2.25.4 jupyterlab_widgets==3.0.10 kubernetes==29.0.0 langchain==0.1.13 +langchain-chroma==0.2.6 langchain-community==0.0.29 langchain-core==0.1.33 langchain-openai==0.1.1 From 392194bda65135730199859c9ed075f51a0678d2 Mon Sep 17 00:00:00 2001 From: Isaac Rudnick 
<48895941+IsaacFigNewton@users.noreply.github.com> Date: Sat, 17 Jan 2026 21:34:25 -0800 Subject: [PATCH 3/3] added .env_sample and updated README.md --- README.md | 5 +++-- notebooks/.env_sample | 1 + 2 files changed, 4 insertions(+), 2 deletions(-) create mode 100644 notebooks/.env_sample diff --git a/README.md b/README.md index 95ffbae..6a96b38 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,10 @@ The repo contains the following: * `talk-materials/talk-sources.md` contains all of the papers and other sources I used for this talk. It also contains all of my image credits. * `talk-materials/beyond-the-hype.pdf` contains a copy of my slides. -# Steps for using the QA pipeline with your own files +# Steps for using the QA pipeline 1. Create a copy of `.env_sample` and rename it to `.env` 2. Get an OpenAI key from [here](https://platform.openai.com/api-keys) 3. Replace `key-here` with your API key in `.env` and save the file 4. Start a terminal window in the root of this repository -5. Enter `pip install -r requirements.txt` \ No newline at end of file +5. Enter `pip install -r requirements.txt` +6. Run all cells of the notebook \ No newline at end of file diff --git a/notebooks/.env_sample b/notebooks/.env_sample new file mode 100644 index 0000000..f280d82 --- /dev/null +++ b/notebooks/.env_sample @@ -0,0 +1 @@ +OPENAI_API_KEY=key-here \ No newline at end of file