improved memory usage

davidheineman · davidheineman · commit b9ffd3296c88 · 2025-01-12T16:45:35.000-08:00
diff --git a/README.md b/README.md
@@ -60,7 +60,7 @@ python src/index.py
 **Deploy Web Server**
 ```sh
 # Start an API endpoint
-gunicorn -w 4 -b 0.0.0.0:8080 server:app
+gunicorn -w 1 --threads 100 --worker-class gthread -b 0.0.0.0:8080 src.server:app
 
 # Then visit:
 # http://localhost:8080
diff --git a/fly.toml b/fly.toml
@@ -14,6 +14,6 @@ primary_region = 'lax'
   processes = ['app']
 
 [[vm]]
-  memory = '4gb'
+  memory = '2gb'
   cpu_kind = 'shared'
   cpus = 2
diff --git a/src/server.py b/src/server.py
@@ -1,26 +1,28 @@
 import os, math, re, requests, io, time
 from typing import List, Optional, Union
 
+import sys
+from pathlib import Path
+sys.path.append(str(Path(__file__).parent))
+
 from flask import Flask, abort, request, render_template, jsonify
 from functools import lru_cache
 
 from constants import INDEX_PATH, VENUES
-
 from search import ColBERT
 from db import create_database, query_paper_metadata
 from utils import download_index_from_hf, print_estimate_cost
 
 import PyPDF2
 from openai import OpenAI
 
+from threading import local
+thread_local = local()
+
 PORT = int(os.getenv("PORT", 8080))
 app = Flask(__name__)
-
-# Load OpenAI API key
-if os.path.exists('.openai-api-key'):
-    with open('.openai-api-key', 'r') as f:
-        api_key = f.read().strip()
-    client = OpenAI(api_key=api_key)
+colbert = None
+_is_initialized = False
 
 @lru_cache(maxsize=1000000)
 def api_search_query(query):
@@ -114,17 +116,26 @@ def get_pdf_text(pdf_url: str) -> str:
     return " ".join(page.extract_text() for page in pdf_reader.pages)
 
 
+def get_openai_client():
+    if not hasattr(thread_local, 'client'):
+        if os.path.exists('.openai-api-key'):
+            with open('.openai-api-key', 'r') as f:
+                api_key = f.read().strip()
+            thread_local.client = OpenAI(api_key=api_key)
+    return thread_local.client
+
+
 @app.route('/api/llm', methods=['POST'])
 def query_llm():
+    print(f'Started a new query!')
+    client = get_openai_client()
     data = request.json
     title = data['title']
     abstract = data['abstract'] 
     question = data['question']
     pdf_url = data['pdf_url'] if 'pdf_url' in data else None
 
     try:
-        print(f'Started a new query!')
-
         start_time = time.time()
         pdf_text = get_pdf_text(pdf_url)
         print(f"PDF text extraction took {time.time() - start_time:.2f} seconds")
@@ -135,7 +146,7 @@ def query_llm():
 Question: {question}
 Please provide a concise answer."""
 
-        print_estimate_cost(prompt, model='gpt-4o-mini', input_cost=0.15, output_cost=0.6, estimated_output_toks=100)
+        # print_estimate_cost(prompt, model='gpt-4o-mini', input_cost=0.15, output_cost=0.6, estimated_output_toks=100)
         
         start_llm_time = time.time()
         response = client.chat.completions.create(
@@ -157,20 +168,29 @@ def query_llm():
         return jsonify({'error': 'Failed to process request'}), 500
 
 
+def init_app():
+    global colbert, _is_initialized
+    if not _is_initialized:
+        download_index_from_hf()
+        create_database()
+        colbert = ColBERT(index_path=INDEX_PATH)
+        _is_initialized = True
+        
+        # Remove client initialization from here since we're using thread-local storage
+
+
+@app.before_request
+def before_request():
+    init_app()
+
+
 if __name__ == "__main__":
     """
     Example usage:
     python server.py
     http://localhost:8080/api/colbert?query=Information retrevial with BERT
     http://localhost:8080/api/search?query=Information retrevial with BERT
     """
-    download_index_from_hf()
-    create_database()
-    global colbert
-    colbert = ColBERT(index_path=INDEX_PATH)
-    print(colbert.search('text simplificaiton'))
-    print(api_search_query("text simplification")['topk'][:5])
-    
     # Watch web dirs for changes
     extra_files = [os.path.join(dirname, filename) for dirname, _, files in os.walk('templates') for filename in files]