inference-ai-course · MichaelXian · Nov 13, 2025 · Nov 13, 2025 · Nov 13, 2025
diff --git a/.gitignore b/.gitignore
@@ -205,3 +205,87 @@ cython_debug/
 marimo/_static/
 marimo/_lsp/
 __marimo__/
+
+# Covers JetBrains IDEs: IntelliJ, GoLand, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
+# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839
+
+# User-specific stuff
+.idea/**/workspace.xml
+.idea/**/tasks.xml
+.idea/**/usage.statistics.xml
+.idea/**/dictionaries
+.idea/**/shelf
+
+# AWS User-specific
+.idea/**/aws.xml
+
+# Generated files
+.idea/**/contentModel.xml
+
+# Sensitive or high-churn files
+.idea/**/dataSources/
+.idea/**/dataSources.ids
+.idea/**/dataSources.local.xml
+.idea/**/sqlDataSources.xml
+.idea/**/dynamic.xml
+.idea/**/uiDesigner.xml
+.idea/**/dbnavigator.xml
+
+# Gradle
+.idea/**/gradle.xml
+.idea/**/libraries
+
+# Gradle and Maven with auto-import
+# When using Gradle or Maven with auto-import, you should exclude module files,
+# since they will be recreated, and may cause churn.  Uncomment if using
+# auto-import.
+# .idea/artifacts
+# .idea/compiler.xml
+# .idea/jarRepositories.xml
+# .idea/modules.xml
+# .idea/*.iml
+# .idea/modules
+# *.iml
+# *.ipr
+
+# CMake
+cmake-build-*/
+
+# Mongo Explorer plugin
+.idea/**/mongoSettings.xml
+
+# File-based project format
+*.iws
+
+# IntelliJ
+out/
+
+# mpeltonen/sbt-idea plugin
+.idea_modules/
+
+# JIRA plugin
+atlassian-ide-plugin.xml
+
+# Cursive Clojure plugin
+.idea/replstate.xml
+
+# SonarLint plugin (for sonarlint.xml see https://community.sonarsource.com/t/is-the-file-idea-idea-idea-sonarlint-xml-intended-to-be-under-source-control/121119)
+.idea/sonarlint/
+.idea/sonarlint.xml
+
+# Crashlytics plugin (for Android Studio and IntelliJ)
+com_crashlytics_export_strings.xml
+crashlytics.properties
+crashlytics-build.properties
+fabric.properties
+
+# Editor-based HTTP Client
+.idea/httpRequests
+http-client.private.env.json
+
+# Android studio 3.1+ serialized cache file
+.idea/caches/build_file_checksums.ser
+
+# Apifox Helper cache
+.idea/.cache/.Apifox_Helper
+.idea/ApifoxUploaderProjectSetting.xml
diff --git a/.idea/.gitignore b/.idea/.gitignore
diff --git a/.idea/Homework4-Submission.iml b/.idea/Homework4-Submission.iml
diff --git a/.idea/inspectionProfiles/profiles_settings.xml b/.idea/inspectionProfiles/profiles_settings.xml
diff --git a/.idea/misc.xml b/.idea/misc.xml
diff --git a/.idea/modules.xml b/.idea/modules.xml
diff --git a/.idea/vcs.xml b/.idea/vcs.xml
diff --git a/Class 4 Homework.ipynb b/Class 4 Homework.ipynb
@@ -80,19 +80,81 @@
    "outputs": [],
    "source": [
     "import fitz  # PyMuPDF\n",
+    "from typing import List\n",
+    "import urllib, urllib.request\n",
+    "import feedparser\n",
+    "import requests\n",
+    "from io import BytesIO\n",
+    "import ssl, certifi, urllib.request\n",
     "\n",
-    "def extract_text_from_pdf(pdf_path: str) -> str:\n",
+    "context = ssl.create_default_context(cafile=certifi.where())\n",
+    "\n",
+    "def ssl_read_url(url: str) -> str:\n",
+    "    return urllib.request.urlopen(url, context=context).read()\n",
+    "\n",
+    "def get_pdf_urls() -> List[str]:\n",
+    "    url = f\"https://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=50\"\n",
+    "    data = ssl_read_url(url)\n",
+    "    res = feedparser.parse(data)\n",
+    "    return [\n",
+    "            link.href\n",
+    "            for entry in res.entries\n",
+    "            for link in entry.links\n",
+    "            if \"pdf\" in link.href\n",
+    "          ]\n",
+    "\n",
+    "    \n",
+    "def extract_text_from_url(url: str) -> str:\n",
     "    \"\"\"\n",
     "    Open a PDF and extract all text as a single string.\n",
     "    \"\"\"\n",
-    "    doc = fitz.open(pdf_path)\n",
+    "    response = requests.get(url)\n",
+    "\n",
+    "    pdf_bytes = BytesIO(response.content)\n",
+    "    doc = fitz.open(stream=pdf_bytes, filetype=\"pdf\")\n",
     "    pages = []\n",
     "    for page in doc:\n",
     "        page_text = page.get_text()  # get raw text from page\n",
     "        # (Optional) clean page_text here (remove headers/footers)\n",
     "        pages.append(page_text)\n",
     "    full_text = \"\\n\".join(pages)\n",
-    "    return full_text\n"
+    "    return full_text"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 11,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "50\n"
+     ]
+    }
+   ],
+   "source": [
+    "urls = get_pdf_urls()\n",
+    "print(len(urls))"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 12,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "50\n"
+     ]
+    }
+   ],
+   "source": [
+    "full_texts = [extract_text_from_url(url) for url in urls]\n",
+    "print(len(full_texts))"
    ]
   },
   {
@@ -104,7 +166,7 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 36,
    "metadata": {},
    "outputs": [],
    "source": [
@@ -115,7 +177,33 @@
     "    for i in range(0, len(tokens), step):\n",
     "        chunk = tokens[i:i + max_tokens]\n",
     "        chunks.append(\" \".join(chunk))\n",
-    "    return chunks\n"
+    "    return chunks\n",
+    "\n",
+    "chunks = [\n",
+    "        chunk  \n",
+    "        for text in full_texts \n",
+    "        for chunk in chunk_text(text)\n",
+    "]\n",
+    "\n",
+    "with open(\"chunks.json\", \"w\", encoding=\"utf-8\") as f:\n",
+    "    json.dump(chunks, f, ensure_ascii=False, indent=4)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 28,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['Wiley-Interscience Publication, p61. [11] E. Benedetto et al., Proceedings of PAC07, p. 4033 (2007).', 'Proceedings of PAC07, p. 4033 (2007).', 'parallel plates separated by 0.2 mm is depicted. The electric ﬁeld, driving the electron, 0 0.05 0.1', 'ctron and Positron Accelerators (MBI97), KEK, Tsukuba, Japan, 15-18 July 1997, KEK Proceedings 97-17', 'build up sim- ulations at CERN, in Proc.of ECLOUD12, La Biodola, Isola d’Elba, Italy, 5-9 June 2012.']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print([chunk[100:] for chunk in chunks[-5:]])"
    ]
   },
   {
@@ -128,19 +216,19 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 30,
    "metadata": {},
    "outputs": [],
    "source": [
+    "from sentence_transformers import SentenceTransformer\n",
+    "from numpy import ndarray\n",
     "\n",
-    "def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:\n",
-    "    tokens = text.split()\n",
-    "    chunks = []\n",
-    "    step = max_tokens - overlap\n",
-    "    for i in range(0, len(tokens), step):\n",
-    "        chunk = tokens[i:i + max_tokens]\n",
-    "        chunks.append(\" \".join(chunk))\n",
-    "    return chunks\n"
+    "model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
+    "\n",
+    "def get_embeddings(chunks: List[str]) -> ndarray:\n",
+    "    return model.encode(chunks) \n",
+    "\n",
+    "embeddings = get_embeddings(chunks)"
    ]
   },
   {
@@ -153,11 +241,11 @@
   },
   {
    "cell_type": "code",
-   "execution_count": null,
+   "execution_count": 37,
    "metadata": {},
    "outputs": [],
    "source": [
-    "\n",
+    "import json\n",
     "import faiss\n",
     "import numpy as np\n",
     "\n",
@@ -167,10 +255,31 @@
     "index.add(np.array(embeddings))  # add all chunk vectors\n",
     "\n",
     "# Example: search for a query embedding\n",
-    "query_embedding = ...  # get embedding for the query (shape: [1, dim])\n",
+    "queries = [\"How do electron holes in materials work\",\n",
+    "           \"Explain how spin-background affects low-lying excitations\",\n",
+    "           \"Explain the properties of metallic adatoms\",\n",
+    "           \"Explain hole-spin qubits\",\n",
+    "           \"Explain why it's hard to scale up quantum computers\"\n",
+    "          ]\n",
+    "query_embeddings = model.encode(queries)  # embed all queries in one batch (shape: [len(queries), dim])\n",
     "k = 3\n",
-    "distances, indices = index.search(query_embedding, k)\n",
-    "# indices[0] holds the top-k chunk indices\n"
+    "distances, indicesList = index.search(query_embeddings, k)\n",
+    "indicesList = indicesList.tolist()\n",
+    "retrieval_report = [\n",
+    "    {\n",
+    "        \"request\": queries[i],\n",
+    "        \"response\": [chunks[index] for index in indices]\n",
+    "    }\n",
+    "    for i, indices in enumerate(indicesList)\n",
+    "]\n",
+    "\n",
+    "import pickle\n",
+    "\n",
+    "with open(\"index.pkl\", \"wb\") as f:\n",
+    "    pickle.dump(index, f)\n",
+    "\n",
+    "with open(\"retrieval_report.json\", \"w\", encoding=\"utf-8\") as f:\n",
+    "    json.dump(retrieval_report, f, ensure_ascii=False, indent=4)"
    ]
   },
   {
@@ -233,10 +342,24 @@
   }
  ],
  "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3 (ipykernel)",
+   "language": "python",
+   "name": "python3"
+  },
   "language_info": {
-   "name": "python"
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.13.9"
   }
  },
  "nbformat": 4,
- "nbformat_minor": 2
+ "nbformat_minor": 4
 }
diff --git a/README.md b/README.md
@@ -0,0 +1,16 @@
+* Code Notebook / Script: See Class 4 Homework.ipynb
+* Data & Index: See src/chunks.json & index.pkl
+* Retrieval Report: See retrieval_report.json
+* FastAPI Service: See src/main.py for the FastAPI app code; instructions on how to run it are below. The /search endpoint should be demonstrable (e.g. returning top-3 passages in JSON for sample queries).
+
+# Running the FastAPI code
+The search index was built from arXiv papers matching the query "electron", using a pre-trained sentence-embedding model. Queries should be related to electrons.
+
+Go to the `src` directory and start the app with uvicorn:
+```bash
+cd src
+uvicorn main:app
+```
+
+
+Thank you to arXiv for use of its open access interoperability.