Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 98 additions & 0 deletions .virtual_documents/Class 7 Homework.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@



import json

# System prompt prepended to every training example; it must match the prompt
# used at inference time so the model sees a consistent format.
system_prompt = "You are a helpful academic Q&A assistant specialized in scholarly content."
data = []

# qas_list is expected to be defined by an earlier cell: a list of dicts
# shaped {"question": str, "answer": str} — TODO confirm against that cell.
for qa in qas_list:
    user_q = qa["question"]
    assistant_a = qa["answer"]
    # Compose a single training string with system/user/assistant role tags.
    full_prompt = f"<|system|>{system_prompt}<|user|>{user_q}<|assistant|>{assistant_a}"
    data.append({"text": full_prompt})

# Write one JSON object per line (JSONL). Explicit utf-8 plus
# ensure_ascii=False keeps non-ASCII characters from the papers readable
# instead of \uXXXX-escaping them.
with open("synthetic_qa.jsonl", "w", encoding="utf-8") as outfile:
    for entry in data:
        outfile.write(json.dumps(entry, ensure_ascii=False) + "\n")








from unsloth import FastLanguageModel
from trl import SFTTrainer  # NOTE(review): SFTTrainer is provided by TRL, not unsloth
from transformers import AutoTokenizer, TrainingArguments
from datasets import load_dataset

# Load the base LLaMA 3 model in 4-bit mode (dynamic 4-bit quantization).
# NOTE(review): Llama 3.1 ships as 8B, not 7B — confirm this repo id exists.
model_name = "unsloth/llama-3.1-7b-unsloth-bnb-4bit"
# unsloth's from_pretrained returns a (model, tokenizer) pair; the original
# assigned the tuple to `model` and built a mismatched tokenizer separately.
model, tokenizer = FastLanguageModel.from_pretrained(model_name)

# Attach LoRA adapters — required for QLoRA: the 4-bit base weights stay
# frozen and only the adapter weights are trained.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    lora_alpha=16,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj",
                    "gate_proj", "up_proj", "down_proj"],
)

# Load our synthetic Q&A dataset (one {"text": ...} object per line).
dataset = load_dataset("json", data_files="synthetic_qa.jsonl", split="train")

# Initialize the trainer for Supervised Fine-Tuning (SFT).
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    args=TrainingArguments(
        output_dir="llama3-7b-qlora-finetuned",
        per_device_train_batch_size=4,   # small batch size for Colab GPU
        gradient_accumulation_steps=4,   # accumulate gradients to simulate larger batch
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=50,
        save_strategy="epoch",
    ),
)

trainer.train()
# Saves the (LoRA-adapted) model weights; keep the tokenizer alongside so the
# checkpoint directory is loadable on its own.
model.save_pretrained("llama3-7b-qlora-finetuned")
tokenizer.save_pretrained("llama3-7b-qlora-finetuned")





# Define some held-out test questions (ensure these were not exactly in
# the training data).
test_questions = [
    "What is the main hypothesis proposed by the paper on quantum computing?",
    "How did the authors of the deep learning study evaluate their model's performance?",
    # ... (add total 10 questions)
]

# Load the base and fine-tuned models for inference.
# unsloth's from_pretrained returns a (model, tokenizer) pair; the original
# assigned the tuple to a single name and then called .generate() on it.
base_model, base_tokenizer = FastLanguageModel.from_pretrained(model_name)  # base model
ft_model, ft_tokenizer = FastLanguageModel.from_pretrained("llama3-7b-qlora-finetuned")

for q in test_questions:
    prompt_input = f"<|system|>{system_prompt}<|user|>{q}<|assistant|>"
    # Tokenize input and generate output with each model.
    input_ids = tokenizer(prompt_input, return_tensors='pt').input_ids.cuda()
    base_output_ids = base_model.generate(input_ids, max_new_tokens=150)
    ft_output_ids = ft_model.generate(input_ids, max_new_tokens=150)
    # Decode the outputs.
    base_answer = tokenizer.decode(base_output_ids[0], skip_special_tokens=True)
    ft_answer = tokenizer.decode(ft_output_ids[0], skip_special_tokens=True)
    # Keep only the text after the assistant tag.
    # NOTE(review): skip_special_tokens=True may already strip '<|assistant|>'
    # if it is a registered special token, making this split a no-op — verify.
    base_answer = base_answer.split('<|assistant|>')[-1].strip()
    ft_answer = ft_answer.split('<|assistant|>')[-1].strip()
    print(f"Q: {q}")
    print(f"Base Model Answer: {base_answer}")
    print(f"Fine-Tuned Model Answer: {ft_answer}")
    print("-" * 60)






66 changes: 66 additions & 0 deletions .virtual_documents/Data Collection.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
!pip install PyMuPDF
!pip install feedparser


import fitz # PyMuPDF
from typing import List
import urllib, urllib.request
import feedparser
import requests
from io import BytesIO
import ssl, certifi, urllib.request

# TLS context backed by certifi's CA bundle — works around environments where
# the interpreter's default trust store cannot verify arxiv.org.
context = ssl.create_default_context(cafile=certifi.where())

def ssl_read_url(url: str) -> bytes:
    """Fetch *url* over HTTPS (using the certifi context) and return the raw body.

    Returns bytes, not str — urlopen().read() yields undecoded bytes, which
    feedparser.parse() accepts directly. (Original annotation said str.)
    """
    return urllib.request.urlopen(url, context=context).read()

def get_pdf_urls(search_query: str = "all:a", max_results: int = 100) -> List[str]:
    """Query the arXiv API and return the PDF links of the matching entries.

    Args:
        search_query: arXiv query string. Default "all:a" (any field containing
            "a") preserves the original behavior of fetching arbitrary papers.
        max_results: number of feed entries to request.

    Returns:
        Hrefs of every entry link whose URL contains "pdf".
    """
    # Original used an f-string with no placeholders and hard-coded values;
    # parameterized here with the same defaults.
    url = (
        "https://export.arxiv.org/api/query"
        f"?search_query={search_query}&start=0&max_results={max_results}"
    )
    data = ssl_read_url(url)
    res = feedparser.parse(data)
    return [
        link.href
        for entry in res.entries
        for link in entry.links
        if "pdf" in link.href
    ]


def extract_text_from_url(url: str) -> str:
    """
    Download a PDF and extract all of its text as a single string.

    Pages are joined with newlines. Raises requests.HTTPError on a 4xx/5xx
    response instead of handing an error page to the PDF parser.
    """
    response = requests.get(url)
    response.raise_for_status()  # fail fast rather than parse a non-PDF body

    pdf_bytes = BytesIO(response.content)
    pages = []
    # Context manager ensures the document handle is closed even if a page
    # fails to parse (the original never closed it).
    with fitz.open(stream=pdf_bytes, filetype="pdf") as doc:
        for page in doc:
            pages.append(page.get_text())  # raw text of one page
    return "\n".join(pages)


urls = get_pdf_urls()
print(len(urls))


import json
from pathlib import Path

# Ensure the output directory exists — a fresh clone would otherwise fail
# with FileNotFoundError on the first write.
out_dir = Path("data/papers")
out_dir.mkdir(parents=True, exist_ok=True)

with open(out_dir / "paper_urls.json", "w", encoding="utf-8") as file:
    json.dump(urls, file)


# Download and extract every paper (network-heavy; re-runs refetch everything).
papers = [
    extract_text_from_url(url) for url in urls
]
print(len(papers))


with open(out_dir / "papers.json", "w", encoding="utf-8") as file:
    json.dump(papers, file)



127 changes: 127 additions & 0 deletions .virtual_documents/Data Generation.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,127 @@
def strip_block_quotes(response: str) -> str:
    """Remove a surrounding Markdown code fence from *response*, if present.

    LLMs often wrap JSON output in ``` fences; when both the first and last
    lines contain a fence, both are dropped. A single-line input is returned
    unchanged — the original treated e.g. "```" as fenced on both ends and
    collapsed it to an empty string.
    """
    lines = response.split("\n")
    if len(lines) >= 2 and "```" in lines[0] and "```" in lines[-1]:
        return "\n".join(lines[1:-1])
    return response

def escape_backslashes(s: str) -> str:
    """Return *s* with every backslash doubled.

    Presumably this protects raw backslashes (e.g. LaTeX in paper text) from
    being read as invalid JSON escapes downstream — confirm against callers.
    """
    return "\\\\".join(s.split("\\"))


from ollama import chat
import json

# Conversation seed sent with every paper: instructs the model to emit a JSON
# array of 5 {"question", "answer"} pairs, one of which must be unanswerable
# from the paper. (Fixed typo "quesion" -> "question" from the original.)
initial_messages = [
    {
        "role": "system",
        "content": """
You will be generating synthetic data for supervised fine tuning.
The user will provide you with a research paper.
You will provide 5 questions and answers. The questions will be research questions which relate to the topic of the paper, but not referencing the paper itself.
One question and answer pair will have a question related to the paper topic, but which is unanswered by it, and the answer should inform the user that the agent does not know the answer, and explain why (e.g. if it requires more research in the paper, or simply is not in the paper, so is not in the knowledge domain of the model).
The question answer pair will be in the form {"question": string, "answer": string}. You will return them in an array in JSON format.
"""
    }
]
# Ollama model tag used for generation.
model = "gpt-oss:20b-cloud"
# Character cutoff applied to each paper before sending (context budget).
max_length = 200000

def generate_qa(paper):
    """Ask the chat model for 5 synthetic Q&A pairs about *paper*.

    Returns a list of {"question": str, "answer": str} dicts parsed from the
    model's JSON response.

    Raises:
        ValueError: fewer than 5 pairs, or a pair missing/empty a field.
        json.JSONDecodeError: the response was not valid JSON after cleanup.
    """
    paper = paper[:max_length]  # cutoff if paper is too large for model to handle
    response = chat(
        model=model,
        messages=[
            *initial_messages,
            {
                "role": "user",
                "content": paper,
            },
        ])
    # The model often wraps its JSON in ``` fences, and raw backslashes from
    # paper text (e.g. LaTeX) form invalid JSON escapes — clean up both before
    # parsing. (Dropped the unused `raw` variable from the original.)
    stripped = strip_block_quotes(response["message"]["content"])
    content = escape_backslashes(stripped)

    pairs = json.loads(content)
    if len(pairs) < 5:
        raise ValueError(f"Unexpected number of pairs: {len(pairs)}. Content: {content}")
    for pair in pairs:
        # .get() so a missing key reports a format error instead of KeyError.
        if not pair.get("question") or not pair.get("answer"):
            raise ValueError(f"Unexpected format: {content}")
    return pairs


# Reload the extracted paper texts produced by the data-collection notebook.
import json

with open("data/papers/papers.json") as paper_file:
    papers = json.load(paper_file)


# Create the paper -> QA-pairs cache only if it does not already exist, so a
# notebook re-run can retry failed papers without discarding finished ones.
# The original tested `if not paper_to_pairs:`, which raises NameError on the
# very first run because the name has never been bound.
if "paper_to_pairs" not in globals():
    paper_to_pairs = dict()
else:
    print("exists, skipping")


# Generate QA pairs per paper, keyed by its first 100 characters so an
# interrupted run can resume without redoing completed papers.
for paper in papers:
    opening = paper[:100]
    if opening in paper_to_pairs:
        print("Pairs already finished, skipping")
        continue
    print(opening)
    try:
        paper_to_pairs[opening] = generate_qa(paper)
    # Narrowed from a bare `except:`, which also swallowed KeyboardInterrupt;
    # log the reason so failures are diagnosable.
    except Exception as exc:
        print(f"Failed, skipping: {exc}")





# Scratch cells: sanity-check corpus and cache sizes.
first_paper_len = len(papers[0])
print(first_paper_len)


second_paper_len = len(papers[1])
print(second_paper_len)


truncated_len = len(papers[1][:1000000])
print(truncated_len)


cache_size = len(paper_to_pairs)
print(cache_size)


# Checkpoint the cache to disk, then read it back (e.g. after a kernel restart).
import json

with open("temp.json", "w") as checkpoint:
    json.dump(paper_to_pairs, checkpoint)


import json

with open("temp.json", "r") as checkpoint:
    paper_to_pairs = json.load(checkpoint)


# Flatten the per-paper lists into one list of QA dicts.
# (.values() instead of .items() — the key was fetched and discarded.)
qa_pairs = [
    pair
    for pairs in paper_to_pairs.values()
    for pair in pairs
]
print(len(qa_pairs))
print(qa_pairs[0])


def to_formatted(pair, system_prompt="You are a helpful academic Q&A assistant specialized in scholarly content."):
    """Render one QA pair as a single SFT training string with role tags.

    *system_prompt* defaults to the prompt used by the training notebook; the
    original read a module-level `system_prompt` that is never defined in this
    notebook (NameError unless another cell happened to set it).
    """
    user_q = pair["question"]
    assistant_a = pair["answer"]
    # Compose the prompt with system, user, assistant roles.
    return f"<|system|>{system_prompt}<|user|>{user_q}<|assistant|>{assistant_a}"



import json

# Write one {"text": ...} object per line so an SFT loader configured with
# dataset_text_field="text" can consume the file. The original dumped the
# bare formatted string, which load_dataset("json", ...) would not expose as
# a "text" column.
with open("data/papers/formatted.jsonl", "w", encoding="utf-8") as file:
    for pair in qa_pairs:
        json.dump({"text": to_formatted(pair)}, file, ensure_ascii=False)
        file.write("\n")



Loading