From d3da4b22a95f2ab011cd031cdb52b89a6bc58574 Mon Sep 17 00:00:00 2001
From: GVN2307
Date: Thu, 12 Feb 2026 17:27:20 +0530
Subject: [PATCH] Fix inference paths, add requirements.txt, and update README

---
 README.md        | 37 ++++++++++++++++++++++------
 inference.py     | 64 +++++++++++++++++++++++++++++++-----------------
 requirements.txt |  7 ++++++
 3 files changed, 77 insertions(+), 31 deletions(-)
 create mode 100644 requirements.txt

diff --git a/README.md b/README.md
index fdd8f7e..7222d8d 100644
--- a/README.md
+++ b/README.md
@@ -83,14 +83,35 @@ This project provides the fine-tuned adapter weights:
 
 ---
 
-## Inference Example
-
-An example script (`inference.py`) is provided to demonstrate how to:
-- Load the Qwen2.5-Math-1.5B base model
-- Attach the fine-tuned LoRA adapter
-- Run step-by-step math inference
-
-Note: Running the script requires downloading the base model from Hugging Face.
+## Setup and Usage
+
+### Prerequisites
+- Python 3.8+
+- CUDA-enabled GPU (recommended, e.g., NVIDIA T4 or better)
+
+### Installation
+1. Clone the repository:
+   ```bash
+   git clone https://github.com/AshChadha-iitg/OpenMath.git
+   cd OpenMath
+   ```
+2. Install dependencies:
+   ```bash
+   pip install -r requirements.txt
+   ```
+
+### Running Inference
+The `inference.py` script is configured to load the fine-tuned LoRA adapter from the current directory and run a sample math problem.
+
+```bash
+python inference.py
+```
+
+You can also import the `solve_problem` function into your own Python scripts:
+```python
+from inference import solve_problem
+print(solve_problem("Your math problem here"))
+```
 
 ---
 
diff --git a/inference.py b/inference.py
index a8d15bf..5c93670 100644
--- a/inference.py
+++ b/inference.py
@@ -22,9 +22,9 @@
 # CONFIG (MATCHES YOUR TRAINING)
 # ==========================
 BASE_MODEL = "Qwen/Qwen2.5-Math-1.5B"
-ADAPTER_PATH = "./openmath-lora" # <-- PUT YOUR ADAPTER HERE
+ADAPTER_PATH = "." # Weights are in the root directory
 
-# 4-bit QLoRA config (same as your T4 training)
+# 4-bit QLoRA config
 bnb_config = BitsAndBytesConfig(
     load_in_4bit=True,
     bnb_4bit_quant_type="nf4",
@@ -35,9 +35,14 @@
 # ==========================
 # LOAD TOKENIZER + MODEL
 # ==========================
+print(f"Loading tokenizer and model: {BASE_MODEL}...")
 tokenizer = AutoTokenizer.from_pretrained(BASE_MODEL)
 tokenizer.pad_token = tokenizer.eos_token
 
+# Check for CUDA
+if not torch.cuda.is_available():
+    print("Warning: CUDA is not available. Loading on CPU might be extremely slow or fail for 4-bit quantization.")
+
 base_model = AutoModelForCausalLM.from_pretrained(
     BASE_MODEL,
     quantization_config=bnb_config,
@@ -45,33 +50,46 @@
 )
 
 # Attach your fine-tuned LoRA adapter
-model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
-model.eval()
+print(f"Attaching adapter from {ADAPTER_PATH}...")
+try:
+    model = PeftModel.from_pretrained(base_model, ADAPTER_PATH)
+    model.eval()
+except Exception as e:
+    print(f"Error loading adapter: {e}")
+    exit(1)
 
 # Silence padding warning
 model.generation_config.pad_token_id = tokenizer.eos_token_id
 
 # ==========================
-# OPENMATH PROMPT (MUST MATCH TRAINING)
+# OPENMATH PROMPT
 # ==========================
-prompt = (
-"### Instruction:\n"
-"Solve the math problem step by step and give the final answer.\n\n"
-"### Problem:\n"
-"If a store sells pencils at 3 for $1, how much do 15 pencils cost?\n\n"
-"### Solution:\n"
-)
+def solve_problem(problem_text):
+    prompt = (
+        "### Instruction:\n"
+        "Solve the math problem step by step and give the final answer.\n\n"
+        "### Problem:\n"
+        f"{problem_text}\n\n"
+        "### Solution:\n"
+    )
 
-inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
+    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
 
-with torch.no_grad():
-    outputs = model.generate(
-        **inputs,
-        max_new_tokens=200,
-        do_sample=False, # deterministic (better for math)
-        repetition_penalty=1.1,
-        no_repeat_ngram_size=3,
-    )
+    with torch.no_grad():
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=512,
+            do_sample=False,
+            repetition_penalty=1.1,
+            no_repeat_ngram_size=3,
+        )
+
+    return tokenizer.decode(outputs[0], skip_special_tokens=True)
 
-print("\n===== OPENMATH OUTPUT =====\n")
-print(tokenizer.decode(outputs[0], skip_special_tokens=True))
+if __name__ == "__main__":
+    example_problem = "If a store sells pencils at 3 for $1, how much do 15 pencils cost?"
+    print("\n===== OPENMATH INFERENCE =====")
+    print(f"Problem: {example_problem}")
+    result = solve_problem(example_problem)
+    print("\n===== OUTPUT =====\n")
+    print(result)
diff --git a/requirements.txt b/requirements.txt
new file mode 100644
index 0000000..f37e6dd
--- /dev/null
+++ b/requirements.txt
@@ -0,0 +1,7 @@
+torch
+transformers
+peft
+bitsandbytes
+accelerate
+safetensors
+sentencepiece