Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions .virtual_documents/Class 7 Homework.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@



import json

# System prompt prepended to every training example; it must match the prompt
# used at inference time so the model sees a consistent format.
system_prompt = "You are a helpful academic Q&A assistant specialized in scholarly content."
data = []

# qas_list is expected to be defined by an earlier cell: a list of dicts
# shaped {"question": str, "answer": str} — TODO confirm against that cell.
for qa in qas_list:
    user_q = qa["question"]
    assistant_a = qa["answer"]
    # Compose a single training string with system/user/assistant role tags.
    full_prompt = f"<|system|>{system_prompt}<|user|>{user_q}<|assistant|>{assistant_a}"
    data.append({"text": full_prompt})

# Write one JSON object per line (JSONL). Explicit utf-8 plus
# ensure_ascii=False keeps non-ASCII characters from the papers readable
# instead of \uXXXX-escaping them.
with open("synthetic_qa.jsonl", "w", encoding="utf-8") as outfile:
    for entry in data:
        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")








from unsloth import FastLanguageModel
from trl import SFTTrainer  # NOTE(review): SFTTrainer is provided by TRL, not unsloth
from transformers import AutoTokenizer, TrainingArguments
from datasets import load_dataset

# Load the base LLaMA 3 model in 4-bit mode (dynamic 4-bit quantization).
# NOTE(review): Llama 3.1 ships as 8B, not 7B — confirm this repo id exists.
model_name = "unsloth/llama-3.1-7b-unsloth-bnb-4bit"
# unsloth's from_pretrained returns a (model, tokenizer) pair; the original
# assigned the tuple to `model` and built a mismatched tokenizer separately.
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

# Attach LoRA adapters — required for QLoRA: the 4-bit base weights stay
# frozen and only the adapter weights are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Load our synthetic Q&A dataset (one {"text": ...} object per line).
dataset = load_dataset("json", data_files="synthetic_qa.jsonl", split="train")

# Initialize the trainer for Supervised Fine-Tuning (SFT).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=TrainingArguments(
        output_dir="llama3-7b-qlora-finetuned",
        per_device_train_batch_size=4,   # small batch size for Colab GPU
        gradient_accumulation_steps=4,   # accumulate gradients to simulate larger batch
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=50,
        save_strategy="epoch",
    ),
)

trainer.train()
# Saves the (LoRA-adapted) model weights; keep the tokenizer alongside so the
# checkpoint directory is loadable on its own.
model.save_pretrained("llama3-7b-qlora-finetuned")
tokenizer.save_pretrained("llama3-7b-qlora-finetuned")





# Define some held-out test questions (ensure these were not exactly in
# the training data).
test_questions = [
    "What is the main hypothesis proposed by the paper on quantum computing?",
    "How did the authors of the deep learning study evaluate their model's performance?",
    # ... (add total 10 questions)
]

# Load the base and fine-tuned models for inference.
# unsloth's from_pretrained returns a (model, tokenizer) pair; the original
# assigned the tuple to a single name and then called .generate() on it.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(model_name)  # base model
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained("llama3-7b-qlora-finetuned")

for q in test_questions:
    prompt_input = f"<|system|>{system_prompt}<|user|>{q}<|assistant|>"
    # Tokenize input and generate output with each model.
    input_ids = tokenizer(prompt_input, return_tensors='pt').input_ids.cuda()
    base_output_ids = base_model.generate(input_ids, max_new_tokens=150)
    ft_output_ids = ft_model.generate(input_ids, max_new_tokens=150)
    # Decode the outputs.
    base_answer = tokenizer.decode(base_output_ids[0], skip_special_tokens=True)
    ft_answer = tokenizer.decode(ft_output_ids[0], skip_special_tokens=True)
    # Keep only the text after the assistant tag.
    # NOTE(review): skip_special_tokens=True may already strip '<|assistant|>'
    # if it is a registered special token, making this split a no-op — verify.
    base_answer = base_answer.split('<|assistant|>')[-1].strip()
    ft_answer = ft_answer.split('<|assistant|>')[-1].strip()
    print(f"Q: {q}")
    print(f"Base Model Answer: {base_answer}")
    print(f"Fine-Tuned Model Answer: {ft_answer}")
    print("-" * 60)






66 changes: 66 additions & 0 deletions .virtual_documents/Data Collection.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
!pip install PyMuPDF
!pip install feedparser


import fitz # PyMuPDF
from typing import List
import urllib, urllib.request
import feedparser
import requests
from io import BytesIO
import ssl, certifi, urllib.request

# TLS context backed by certifi's CA bundle — works around environments where
# the interpreter's default trust store cannot verify arxiv.org.
context = ssl.create_default_context(cafile=certifi.where())

def ssl_read_url(url: str) -> bytes:
    """Fetch *url* over HTTPS (using the certifi context) and return the raw body.

    Returns bytes, not str — urlopen().read() yields undecoded bytes, which
    feedparser.parse() accepts directly. (Original annotation said str.)
    """
    return urllib.request.urlopen(url, context=context).read()

def get_pdf_urls(search_query: str = "all:a", max_results: int = 100) -> List[str]:
    """Query the arXiv API and return the PDF links of the matching entries.

    Args:
        search_query: arXiv query string. Default "all:a" (any field containing
            "a") preserves the original behavior of fetching arbitrary papers.
        max_results: number of feed entries to request.

    Returns:
        Hrefs of every entry link whose URL contains "pdf".
    """
    # Original used an f-string with no placeholders and hard-coded values;
    # parameterized here with the same defaults.
    url = (
        "https://export.arxiv.org/api/query"
        f"?search_query={search_query}&start=0&max_results={max_results}"
    )
    data = ssl_read_url(url)
    res = feedparser.parse(data)
    return [
        link.href
        for entry in res.entries
        for link in entry.links
        if "pdf" in link.href
    ]


def extract_text_from_url(url: str) -> str:
    """
    Download a PDF and extract all of its text as a single string.

    Pages are joined with newlines. Raises requests.HTTPError on a 4xx/5xx
    response instead of handing an error page to the PDF parser.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail fast rather than parse a non-PDF body

    pdf_bytes = BytesIO(response.content)
    pages = []
    # Context manager ensures the document handle is closed even if a page
    # fails to parse (the original never closed it).
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pages.append(page.get_text())  # raw text of one page
    return "\n".join(pages)


urls = get_pdf_urls()
print(len(urls))


import json
from pathlib import Path

# Ensure the output directory exists — a fresh clone would otherwise fail
# with FileNotFoundError on the first write.
out_dir = Path("data/papers")
out_dir.mkdir(parents=True, exist_ok=True)

with open(out_dir / "paper_urls.json", "w", encoding="utf-8") as file:
    json.dump(urls, file)


# Download and extract every paper (network-heavy; re-runs refetch everything).
papers = [
    extract_text_from_url(url) for url in urls
]
print(len(papers))


with open(out_dir / "papers.json", "w", encoding="utf-8") as file:
    json.dump(papers, file)



127 changes: 127 additions & 0 deletions .virtual_documents/Data Generation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
def strip_block_quotes(response: str) -> str:
    """Remove a surrounding Markdown code fence from *response*, if present.

    LLMs often wrap JSON output in ``` fences; when both the first and last
    lines contain a fence, both are dropped. A single-line input is returned
    unchanged — the original treated e.g. "```" as fenced on both ends and
    collapsed it to an empty string.
    """
    lines = response.split("\n")
    if len(lines) >= 2 and "```" in lines[0] and "```" in lines[-1]:
        return "\n".join(lines[1:-1])
    return response

def escape_backslashes(s: str) -> str:
    """Return *s* with every backslash doubled.

    Presumably this protects raw backslashes (e.g. LaTeX in paper text) from
    being read as invalid JSON escapes downstream — confirm against callers.
    """
    return "\\\\".join(s.split("\\"))


from ollama import chat
import json

# Conversation seed sent with every paper: instructs the model to emit a JSON
# array of 5 {"question", "answer"} pairs, one of which must be unanswerable
# from the paper. (Fixed typo "quesion" -> "question" from the original.)
initial_messages = [
    {
        "role": "system",
        "content": """
You will be generating synthetic data for supervised fine tuning.
The user will provide you with a research paper.
You will provide 5 questions and answers. The questions will be research questions which relate to the topic of the paper, but not referencing the paper itself.
One question and answer pair will have a question related to the paper topic, but which is unanswered by it, and the answer should inform the user that the agent does not know the answer, and explain why (e.g. if it requires more research in the paper, or simply is not in the paper, so is not in the knowledge domain of the model).
The question answer pair will be in the form {"question": string, "answer": string}. You will return them in an array in JSON format.
"""
    }
]
# Ollama model tag used for generation.
model = "gpt-oss:20b-cloud"
# Character cutoff applied to each paper before sending (context budget).
max_length = 200000

def generate_qa(paper):
    """Ask the chat model for 5 synthetic Q&A pairs about *paper*.

    Returns a list of {"question": str, "answer": str} dicts parsed from the
    model's JSON response.

    Raises:
        ValueError: fewer than 5 pairs, or a pair missing/empty a field.
        json.JSONDecodeError: the response was not valid JSON after cleanup.
    """
    paper = paper[:max_length]  # cutoff if paper is too large for model to handle
    response = chat(
        model=model,
        messages=[
            *initial_messages,
            {
                "role": "user",
                "content": paper,
            },
        ])
    # The model often wraps its JSON in ``` fences, and raw backslashes from
    # paper text (e.g. LaTeX) form invalid JSON escapes — clean up both before
    # parsing. (Dropped the unused `raw` variable from the original.)
    stripped = strip_block_quotes(response["message"]["content"])
    content = escape_backslashes(stripped)

    pairs = json.loads(content)
    if len(pairs) < 5:
        raise ValueError(f"Unexpected number of pairs: {len(pairs)}. Content: {content}")
    for pair in pairs:
        # .get() so a missing key reports a format error instead of KeyError.
        if not pair.get("question") or not pair.get("answer"):
            raise ValueError(f"Unexpected format: {content}")
    return pairs


# Reload the extracted paper texts produced by the data-collection notebook.
import json

with open("data/papers/papers.json") as paper_file:
    papers = json.load(paper_file)


# Create the paper -> QA-pairs cache only if it does not already exist, so a
# notebook re-run can retry failed papers without discarding finished ones.
# The original tested `if not paper_to_pairs:`, which raises NameError on the
# very first run because the name has never been bound.
if "paper_to_pairs" not in globals():
    paper_to_pairs = dict()
else:
    print("exists, skipping")


# Generate QA pairs per paper, keyed by its first 100 characters so an
# interrupted run can resume without redoing completed papers.
for paper in papers:
    opening = paper[:100]
    if opening in paper_to_pairs:
        print("Pairs already finished, skipping")
        continue
    print(opening)
    try:
        paper_to_pairs[opening] = generate_qa(paper)
    # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt;
    # log the reason so failures are diagnosable.
    except Exception as exc:
        print(f"Failed, skipping: {exc}")





# Scratch cells: sanity-check corpus and cache sizes.
first_paper_len = len(papers[0])
print(first_paper_len)


second_paper_len = len(papers[1])
print(second_paper_len)


truncated_len = len(papers[1][:1000000])
print(truncated_len)


cache_size = len(paper_to_pairs)
print(cache_size)


# Checkpoint the cache to disk, then read it back (e.g. after a kernel restart).
import json

with open("temp.json", "w") as checkpoint:
    json.dump(paper_to_pairs, checkpoint)


import json

with open("temp.json", "r") as checkpoint:
    paper_to_pairs = json.load(checkpoint)


# Flatten the per-paper lists into one list of QA dicts.
# (.values() instead of .items() — the key was fetched and discarded.)
qa_pairs = [
    pair
    for pairs in paper_to_pairs.values()
    for pair in pairs
]
print(len(qa_pairs))
print(qa_pairs[0])


def to_formatted(pair, system_prompt="You are a helpful academic Q&A assistant specialized in scholarly content."):
    """Render one QA pair as a single SFT training string with role tags.

    *system_prompt* defaults to the prompt used by the training notebook; the
    original read a module-level `system_prompt` that is never defined in this
    notebook (NameError unless another cell happened to set it).
    """
    user_q = pair["question"]
    assistant_a = pair["answer"]
    # Compose the prompt with system, user, assistant roles.
    return f"<|system|>{system_prompt}<|user|>{user_q}<|assistant|>{assistant_a}"



import json

# Write one {"text": ...} object per line so an SFT loader configured with
# dataset_text_field="text" can consume the file. The original dumped the
# bare formatted string, which load_dataset("json", ...) would not expose as
# a "text" column.
with open("data/papers/formatted.jsonl", "w", encoding="utf-8") as file:
    for pair in qa_pairs:
        json.dump({"text": to_formatted(pair)}, file, ensure_ascii=False)
        file.write("\n")



Loading