Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
84 changes: 84 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -205,3 +205,87 @@ cython_debug/
marimo/_static/
marimo/_lsp/
__marimo__/

# Covers JetBrains IDEs: IntelliJ, GoLand, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and Rider
# Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839

# User-specific stuff
.idea/**/workspace.xml
.idea/**/tasks.xml
.idea/**/usage.statistics.xml
.idea/**/dictionaries
.idea/**/shelf

# AWS User-specific
.idea/**/aws.xml

# Generated files
.idea/**/contentModel.xml

# Sensitive or high-churn files
.idea/**/dataSources/
.idea/**/dataSources.ids
.idea/**/dataSources.local.xml
.idea/**/sqlDataSources.xml
.idea/**/dynamic.xml
.idea/**/uiDesigner.xml
.idea/**/dbnavigator.xml

# Gradle
.idea/**/gradle.xml
.idea/**/libraries

# Gradle and Maven with auto-import
# When using Gradle or Maven with auto-import, you should exclude module files,
# since they will be recreated, and may cause churn. Uncomment if using
# auto-import.
# .idea/artifacts
# .idea/compiler.xml
# .idea/jarRepositories.xml
# .idea/modules.xml
# .idea/*.iml
# .idea/modules
# *.iml
# *.ipr

# CMake
cmake-build-*/

# Mongo Explorer plugin
.idea/**/mongoSettings.xml

# File-based project format
*.iws

# IntelliJ
out/

# mpeltonen/sbt-idea plugin
.idea_modules/

# JIRA plugin
atlassian-ide-plugin.xml

# Cursive Clojure plugin
.idea/replstate.xml

# SonarLint plugin
.idea/sonarlint/
# see https://community.sonarsource.com/t/is-the-file-idea-idea-idea-sonarlint-xml-intended-to-be-under-source-control/121119
.idea/sonarlint.xml

# Crashlytics plugin (for Android Studio and IntelliJ)
com_crashlytics_export_strings.xml
crashlytics.properties
crashlytics-build.properties
fabric.properties

# Editor-based HTTP Client
.idea/httpRequests
http-client.private.env.json

# Android studio 3.1+ serialized cache file
.idea/caches/build_file_checksums.ser

# Apifox Helper cache
.idea/.cache/.Apifox_Helper
.idea/ApifoxUploaderProjectSetting.xml
3 changes: 3 additions & 0 deletions .idea/.gitignore

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/Homework4-Submission.iml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/inspectionProfiles/profiles_settings.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

7 changes: 7 additions & 0 deletions .idea/misc.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

8 changes: 8 additions & 0 deletions .idea/modules.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions .idea/vcs.xml

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

165 changes: 144 additions & 21 deletions Class 4 Homework.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -80,19 +80,81 @@
"outputs": [],
"source": [
"import fitz # PyMuPDF\n",
"from typing import List\n",
"import urllib, urllib.request\n",
"import feedparser\n",
"import requests\n",
"from io import BytesIO\n",
"import ssl, certifi, urllib.request\n",
"\n",
"def extract_text_from_pdf(pdf_path: str) -> str:\n",
"context = ssl.create_default_context(cafile=certifi.where())\n",
"\n",
"def ssl_read_url(url: str) -> str:\n",
" return urllib.request.urlopen(url, context=context).read()\n",
"\n",
"def get_pdf_urls() -> List[str]:\n",
" url = f\"https://export.arxiv.org/api/query?search_query=all:electron&start=0&max_results=50\"\n",
" data = ssl_read_url(url)\n",
" res = feedparser.parse(data)\n",
" return [\n",
" link.href\n",
" for entry in res.entries\n",
" for link in entry.links\n",
" if \"pdf\" in link.href\n",
" ]\n",
"\n",
" \n",
"def extract_text_from_url(url: str) -> str:\n",
" \"\"\"\n",
" Open a PDF and extract all text as a single string.\n",
" \"\"\"\n",
" doc = fitz.open(pdf_path)\n",
" response = requests.get(url)\n",
"\n",
" pdf_bytes = BytesIO(response.content)\n",
" doc = fitz.open(stream=pdf_bytes, filetype=\"pdf\")\n",
" pages = []\n",
" for page in doc:\n",
" page_text = page.get_text() # get raw text from page\n",
" # (Optional) clean page_text here (remove headers/footers)\n",
" pages.append(page_text)\n",
" full_text = \"\\n\".join(pages)\n",
" return full_text\n"
" return full_text"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n"
]
}
],
"source": [
"urls = get_pdf_urls()\n",
"print(len(urls))"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"50\n"
]
}
],
"source": [
"full_texts = [extract_text_from_url(url) for url in urls]\n",
"print(len(full_texts))"
]
},
{
Expand All @@ -104,7 +166,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
Expand All @@ -115,7 +177,33 @@
" for i in range(0, len(tokens), step):\n",
" chunk = tokens[i:i + max_tokens]\n",
" chunks.append(\" \".join(chunk))\n",
" return chunks\n"
" return chunks\n",
"\n",
"chunks = [\n",
" chunk \n",
" for text in full_texts \n",
" for chunk in chunk_text(text)\n",
"]\n",
"\n",
"with open(\"chunks.json\", \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(chunks, f, ensure_ascii=False, indent=4)"
]
},
{
"cell_type": "code",
"execution_count": 28,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Wiley-Interscience Publication, p61. [11] E. Benedetto et al., Proceedings of PAC07, p. 4033 (2007).', 'Proceedings of PAC07, p. 4033 (2007).', 'parallel plates separated by 0.2 mm is depicted. The electric field, driving the electron, 0 0.05 0.1', 'ctron and Positron Accelerators (MBI97), KEK, Tsukuba, Japan, 15-18 July 1997, KEK Proceedings 97-17', 'build up sim- ulations at CERN, in Proc.of ECLOUD12, La Biodola, Isola d’Elba, Italy, 5-9 June 2012.']\n"
]
}
],
"source": [
"print([chunk[100:] for chunk in chunks[-5:]])"
]
},
{
Expand All @@ -128,19 +216,19 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 30,
"metadata": {},
"outputs": [],
"source": [
"from sentence_transformers import SentenceTransformer\n",
"from numpy import ndarray\n",
"\n",
"def chunk_text(text: str, max_tokens: int = 512, overlap: int = 50) -> List[str]:\n",
" tokens = text.split()\n",
" chunks = []\n",
" step = max_tokens - overlap\n",
" for i in range(0, len(tokens), step):\n",
" chunk = tokens[i:i + max_tokens]\n",
" chunks.append(\" \".join(chunk))\n",
" return chunks\n"
"model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
"\n",
"def get_embeddings(chunks: List[str]) -> ndarray:\n",
" return model.encode(chunks) \n",
"\n",
"embeddings = get_embeddings(chunks)"
]
},
{
Expand All @@ -153,11 +241,11 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"\n",
"import json\n",
"import faiss\n",
"import numpy as np\n",
"\n",
Expand All @@ -167,10 +255,31 @@
"index.add(np.array(embeddings)) # add all chunk vectors\n",
"\n",
"# Example: search for a query embedding\n",
"query_embedding = ... # get embedding for the query (shape: [1, dim])\n",
"queries = [\"How do electron holes in materials work\",\n",
" \"Explain how spin-background affects low-lying excitations\",\n",
" \"Explain the properties of metallic adatoms\",\n",
" \"Explain hole-spin qubits\",\n",
" \"Explain why it's hard to scale up quantum computers\"\n",
" ]\n",
"query_embeddings = model.encode(queries) # embed all queries at once (shape: [num_queries, dim])\n",
"k = 3\n",
"distances, indices = index.search(query_embedding, k)\n",
"# indices[0] holds the top-k chunk indices\n"
"distances, indicesList = index.search(query_embeddings, k)\n",
"indicesList = indicesList.tolist()\n",
"retrieval_report = [\n",
" {\n",
" \"request\": queries[i],\n",
" \"response\": [chunks[index] for index in indices]\n",
" }\n",
" for i, indices in enumerate(indicesList)\n",
"]\n",
"\n",
"import pickle\n",
"\n",
"with open(\"index.pkl\", \"wb\") as f:\n",
" pickle.dump(index, f)\n",
"\n",
"with open(\"retrieval_report.json\", \"w\", encoding=\"utf-8\") as f:\n",
" json.dump(retrieval_report, f, ensure_ascii=False, indent=4)"
]
},
{
Expand Down Expand Up @@ -233,10 +342,24 @@
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"name": "python"
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.13.9"
}
},
"nbformat": 4,
"nbformat_minor": 2
"nbformat_minor": 4
}
16 changes: 16 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
* Code Notebook / Script: See Class 4 Homework.ipynb
* Data & Index: See src/chunks.json (text chunks) and index.pkl (the FAISS index saved by the notebook)
* Retrieval Report: See retrieval_report.json
* FastAPI Service: The FastAPI app code is in src/main.py; instructions on how to run it are below. The /search endpoint returns the top-3 passages in JSON for a query (e.g. the sample queries in the retrieval report).

# Running the FastAPI code
The search index was built from arXiv papers matching the search term "electron", so queries should be related to electrons.

Change into the `src` directory and start the server with uvicorn:
```bash
cd src
uvicorn main:app
```


Thank you to arXiv for use of its open access interoperability.
Loading