Evaluation script for summarization task w/ Rouge score #2079

Open · wants to merge 6 commits into base: main
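This PR adds a ROUGE evaluation script for CNN/DailyMail summarization, an `accelerate` config for single-node multi-GPU inference, and a launch script. A minimal invocation might look like the sketch below (model path and sample count are placeholders; the config and script file names are assumed from the included launch script):

accelerate launch --config_file accelerate_default_config.4gpus.yaml \
    rouge_accelerate.py \
    --model-path /path/to/model --use-accelerate 1 \
    --batch 8 --samples 1000 \
    --generation lm-eval-harness --output-dir rouge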
@@ -0,0 +1,16 @@
compute_environment: LOCAL_MACHINE
debug: false
distributed_type: MULTI_GPU
downcast_bf16: 'no'
gpu_ids: all
machine_rank: 0
main_training_function: main
mixed_precision: 'no'
num_machines: 1
num_processes: 4
rdzv_backend: static
same_network: true
tpu_env: []
tpu_use_cluster: false
tpu_use_sudo: false
use_cpu: false
@@ -0,0 +1,283 @@
import argparse
import contextlib
import json
import os

import datasets
import evaluate
import nltk
import torch
from accelerate import Accelerator
from lm_eval.utils import stop_sequences_criteria
from sparseml.pytorch.model_load.helpers import (
RECIPE_FILE_NAME, apply_recipe_structure_to_model)
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm
from transformers import (AutoModelForCausalLM, AutoTokenizer,
default_data_collator)

nltk.download("punkt")


ARTICLE_TEMPLATE = "Article:\n{article}"

SUMMARY_TEMPLATE = "\n\n### Summarization:\n"
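# The full prompt is ARTICLE_TEMPLATE followed by SUMMARY_TEMPLATE, i.e.
#   "Article:\n<article>\n\n### Summarization:\n"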


def load_model(model_path):
model = AutoModelForCausalLM.from_pretrained(model_path)
input_recipe_path = os.path.join(model_path, RECIPE_FILE_NAME)
if os.path.exists(input_recipe_path):
apply_recipe_structure_to_model(
model=model, recipe_path=input_recipe_path, model_path=model_path
)
return model


def load_tokenizer(model_path):
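    # Left padding keeps prompts right-aligned so generation continues directly
    # after the prompt for every sequence in a batch; EOS doubles as the pad token.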
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer.deprecation_warnings["Asking-to-pad-a-fast-tokenizer"] = True
return tokenizer


def postprocess_text(preds, labels, first_k_preds):
preds = [pred.strip() for pred in preds]
labels = [label.strip() for label in labels]

    # ROUGE expects a newline after each sentence; keep only the first K predicted
    # sentences when first_k_preds > 0 (a non-positive value keeps all sentences,
    # whereas slicing with -1 would silently drop the last sentence).
    keep = first_k_preds if first_k_preds > 0 else None
    preds = ["\n".join(nltk.sent_tokenize(pred)[:keep]) for pred in preds]
    labels = ["\n".join(nltk.sent_tokenize(label)) for label in labels]

return preds, labels
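    # Example: with first_k_preds=2, "A. B. C." -> "A.\nB.", so that rougeLsum
    # scores sentence-by-sentence, matching the newline convention used by
    # evaluate's "rouge" metric.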


def main(model_path, batch, dataset_path, dataset_name):
model = load_model(model_path)
tokenizer = load_tokenizer(model_path)

    accelerator = Accelerator() if args.use_accelerate else None

    # accelerator is None on the single-GPU debug path, so fall back to a no-op context
    main_process_first = (
        accelerator.main_process_first()
        if accelerator is not None
        else contextlib.nullcontext()
    )
    with main_process_first:
        dataset = datasets.load_dataset(dataset_path, dataset_name, split="validation")
        if args.samples > 0:
            dataset = dataset.shuffle(seed=42).select(range(args.samples))

result_path = os.path.join(model_path, args.output_dir)
if not os.path.exists(result_path):
os.makedirs(result_path)

if args.generation == "lm-eval-harness":
gen_kwargs = {
"do_sample": False,
"temperature": 1.0, # To disable warning
"top_p": 1.0, # To disable warning
"bos_token_id": 1,
"eos_token_id": 2,
"pad_token_id": 0,
"max_new_tokens": 512,
}
elif args.generation == "top_k":
# Similar to GPT-2 decoding strategy used for summarization
# (see their paper, section 3.6)
gen_kwargs = {
"do_sample": True,
"top_k": args.top_k,
"max_new_tokens": args.max_new_tokens,
}
else:
raise ValueError(f"Unknown decoding strategy: {args.generation}")

def _process_sample(sample):
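        # Build the prompt: article (truncated to fit max_input_length) followed by the
        # summarization header. Reference highlights are tokenized separately and carried
        # through as "tok_highlights".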
article = ARTICLE_TEMPLATE.format(article=sample["article"])
tok_summary = tokenizer(SUMMARY_TEMPLATE)

# Exclude the BOS from the tokenized summary
tok_summary = {k: tok_summary[k][1:] for k in tok_summary}

max_tok_article = args.max_input_length - len(tok_summary["input_ids"])
tok_article = tokenizer(
article, max_length=max_tok_article, truncation=True, padding="max_length"
)

model_inputs = {k: tok_article[k] + tok_summary[k] for k in tok_article}

prompt_length = len(model_inputs["input_ids"])
highlights = tokenizer(
sample["highlights"],
max_length=prompt_length,
truncation=True,
padding="max_length",
)
model_inputs["tok_highlights"] = highlights["input_ids"]

# Using "label" for sample ID since it will be recognized and reserved by
# the default data collator used below
model_inputs["label"] = hash(sample["id"])

return model_inputs

tokenized_dataset = dataset.map(_process_sample, batched=False, num_proc=16)
remove_columns = dataset.column_names
tokenized_dataset = tokenized_dataset.remove_columns(remove_columns)
tokenized_dataset.set_format("torch")

data_collator = default_data_collator
dataloader = DataLoader(
tokenized_dataset,
batch_size=batch,
shuffle=False,
num_workers=16,
pin_memory=True,
collate_fn=data_collator,
)
    if accelerator is not None:
        model, dataloader = accelerator.prepare(model, dataloader)
    elif torch.cuda.is_available():
        # Single-process debug path: move the model to the GPU manually
        model = model.cuda()

    if accelerator is None or accelerator.is_main_process:
saved_preds = {"ids": [], "predictions": [], "highlights": []}
rouge_score = evaluate.load("rouge")

model.eval()
for step, batch in enumerate(tqdm(dataloader)):
labels = batch["labels"]
with torch.no_grad():
if args.generation == "lm-eval-harness":
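                    # Stop decoding at a blank line or at the start of a new article prompt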
stop = ["\n\n", "Article:"]
initial_decoder_input_length = batch["input_ids"].shape[1]
batch_size = batch["input_ids"].shape[0]
stopping_criteria = stop_sequences_criteria(
tokenizer, stop, initial_decoder_input_length, batch_size
)
else:
stopping_criteria = None

prompt_length = batch["input_ids"].shape[1]
if args.use_accelerate:
generated_tokens = accelerator.unwrap_model(model).generate(
batch["input_ids"],
attention_mask=batch["attention_mask"],
stopping_criteria=stopping_criteria,
**gen_kwargs,
)
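                # Pad sequences to a common length across processes so gather() can stack them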
generated_tokens = accelerator.pad_across_processes(
generated_tokens, dim=1, pad_index=tokenizer.pad_token_id
)
highlights = batch["tok_highlights"]

generated_tokens = accelerator.gather(generated_tokens).cpu().numpy()
highlights = accelerator.gather(highlights).cpu().numpy()
labels = accelerator.gather(labels).cpu().numpy()
else:
# Code path for debugging only with 1 GPU
batch = {k: batch[k].to(model.device) for k in batch.keys()}
generated_tokens = model.generate(
batch["input_ids"],
attention_mask=batch["attention_mask"],
stopping_criteria=stopping_criteria,
**gen_kwargs,
)
highlights = batch["tok_highlights"]

generated_tokens = generated_tokens.cpu().numpy()
highlights = highlights.cpu().numpy()
labels = labels.cpu().numpy()
batch = None
torch.cuda.empty_cache()

if isinstance(generated_tokens, tuple):
generated_tokens = generated_tokens[0]

        # Drop the prompt tokens; only the continuation after prompt_length is the generated summary
        generated_summary_tokens = generated_tokens[:, prompt_length:]
decoded_preds = tokenizer.batch_decode(
generated_summary_tokens, skip_special_tokens=True
)
decoded_highlights = tokenizer.batch_decode(
highlights, skip_special_tokens=True
)
decoded_preds, decoded_highlights = postprocess_text(
decoded_preds, decoded_highlights, args.first_k_preds
)

assert len(labels) == len(decoded_preds) == len(decoded_highlights)

        if accelerator is None or accelerator.is_main_process:
saved_preds["ids"] += labels.tolist()
saved_preds["predictions"] += decoded_preds
saved_preds["highlights"] += decoded_highlights

    if accelerator is None or accelerator.is_main_process:
results = rouge_score.compute(
predictions=saved_preds["predictions"], references=saved_preds["highlights"]
)
print(f"Rouge score: {results}")

with open(os.path.join(result_path, f"predictions.json"), "w") as f:
json.dump(saved_preds, f)

        result_file_name = (
            f"rouge_{args.samples}samples.json"
            if args.samples > 0
            else "rouge_full_validation.json"
        )
results.update(
{
"generation": args.generation,
"generation_config": gen_kwargs,
"prompt": ARTICLE_TEMPLATE + SUMMARY_TEMPLATE,
}
)
result_file_path = os.path.join(result_path, result_file_name)
assert not os.path.exists(
result_file_path
), f"File {result_file_path} already exists! Results will not be saved."
with open(result_file_path, "w") as f:
json.dump(results, f)


if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Compute ROUGE score")
    # Note: argparse's `type=bool` would treat any non-empty string (including "0"
    # and "False") as True, so an integer flag is used instead.
    parser.add_argument(
        "--use-accelerate",
        type=int,
        default=0,
        help="Use accelerate (0 or 1). Default: 0",
    )
parser.add_argument("--model-path", type=str, help="model path")
parser.add_argument(
"--output-dir", type=str, default="rouge", help="Output directory"
)
parser.add_argument(
"--max-new-tokens", type=int, default=512, help="Max new tokens"
)
parser.add_argument(
"--max-input-length",
type=int,
default=2048,
help="Max tokenized input length to model",
)
parser.add_argument(
"--first-k-preds", type=int, default=-1, help="Use first K predictions"
)
parser.add_argument("--batch", type=int, default=8, help="Batch size")
parser.add_argument(
"--samples", type=int, default=-1, help="Numer of samples. Default to all."
)
parser.add_argument(
"--generation",
type=str,
default="lm-eval-harness",
help="Generation strategies: lm-eval-harness, top_k",
)
parser.add_argument(
"--top-k", type=int, default=10, help="top_k in the top_k stategy"
)
parser.add_argument(
"--dataset-path", type=str, default="cnn_dailymail", help="dataset path"
)
parser.add_argument(
"--dataset-name", type=str, default="3.0.0", help="dataset name"
)

args = parser.parse_args()

main(args.model_path, args.batch, args.dataset_path, args.dataset_name)
@@ -0,0 +1,15 @@
#!/bin/bash

export CUDA_VISIBLE_DEVICES=0,1,2,3
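# NPROC = number of visible GPUs (count of commas in CUDA_VISIBLE_DEVICES plus one)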
NPROC=$(($(echo $CUDA_VISIBLE_DEVICES | grep -o "," | wc -l)+1))

SRC_ROOT=$HOME/work/llama2.cnn_dailymail.eval/src/my_scripts

source $SRC_ROOT/start_here.sh

for MODEL_NAME in sparse_ft@SRCcerebras50@lr1e-4@WD0.0@B8@GrAcc8@W0.1@ep2@GPUs7@ID15577
do
M=$HOME/models/llama2/cnn_dailymail/llama-recipes/sparse_finetuned/$MODEL_NAME
accelerate launch --config_file $SRC_ROOT/accelerate_default_config.${NPROC}gpus.yaml \
    $SRC_ROOT/rouge_accelerate.py \
    --model-path $M --batch 2 --samples 16 \
    --generation top_k --top-k 2 --max-new-tokens 100 --first-k-preds 3 \
    --use-accelerate 1 --output-dir rouge
done
