-
Notifications
You must be signed in to change notification settings - Fork 2
/
bayesian_summarization.py
84 lines (65 loc) · 2.85 KB
/
bayesian_summarization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
import argparse
import os
import random
import torch
from src.bayesian_summarization.bayesian import BayesianSummarizer
from src.common.loaders import init_loader, load_model
from src.common.scoring import score_standard
def read_args():
    """Parse command-line arguments for Bayesian summarization.

    Returns:
        A (args, unknown) pair: the parsed ``argparse.Namespace`` and the
        list of unrecognized argv tokens (``parse_known_args`` is used so
        wrapper scripts may pass extra flags through without error).
    """
    parser = argparse.ArgumentParser(
        description="Generate and score Bayesian (MC-sampled) summaries on a test split.")
    parser.add_argument("--model_path", type=str,
                        help="Path or name of the summarization model checkpoint.")
    parser.add_argument("--output_path", type=str,
                        help="Directory where generated_sums.csv and metrics.csv are written.")
    parser.add_argument("--dataset_name", type=str,
                        help="HF dataset name to load the test split from.")
    parser.add_argument("--dataset_config_name", type=str,
                        help="Optional dataset configuration name.")
    parser.add_argument("--data_path", type=str,
                        help="Path to local data (alternative to --dataset_name).")
    parser.add_argument("--text_column", type=str,
                        help="Name of the column containing source articles.")
    parser.add_argument("--summary_column", type=str,
                        help="Name of the column containing reference summaries.")
    parser.add_argument("--tokenizer_name", type=str,
                        help="Tokenizer name or path (defaults to the model's own if unset).")
    parser.add_argument("--max_source_length", type=int, default=256,
                        help="Maximum number of source tokens fed to the model.")
    parser.add_argument("--max_summary_length", type=int, default=128,
                        help="Maximum number of tokens in a generated summary.")
    parser.add_argument("--max_test_samples", type=int,
                        help="If set, truncate the test split to this many examples.")
    parser.add_argument("--mc_samples", type=int, default=10,
                        help="Number of Monte Carlo dropout samples per article.")
    parser.add_argument("--seed", type=int, default=10,
                        help="Random seed for python and torch RNGs.")
    parser.add_argument("--test_batch_size", type=int, default=8,
                        help="Batch size for the test dataloader.")
    parser.add_argument("--num_beams", type=int, default=3,
                        help="Beam width used during generation.")
    # parse_known_args keeps unrecognized flags instead of exiting,
    # so this script tolerates extra launcher-supplied arguments.
    args, unknown = parser.parse_known_args()
    return args, unknown
def main():
    """Run Bayesian summary generation on the test split and score the output.

    Pipeline: parse args, seed RNGs, build the test dataloader, generate
    MC-sampled summaries with ``BayesianSummarizer``, compute standard
    scores plus per-article BLEU variance, and write two CSVs
    (``generated_sums.csv``, ``metrics.csv``) into ``output_path``.
    """
    # CUDA for PyTorch
    use_cuda = torch.cuda.is_available()
    # NOTE(review): `device` is computed but never passed anywhere below —
    # presumably BayesianSummarizer selects its own device; confirm.
    device = torch.device("cuda:0" if use_cuda else "cpu")
    torch.backends.cudnn.benchmark = True

    args, unknown = read_args()

    # Seed both python and torch RNGs so MC sampling is reproducible.
    random.seed(args.seed)
    torch.manual_seed(args.seed)

    # makedirs(exist_ok=True) instead of mkdir: supports nested output paths
    # (mkdir raises when a parent directory is missing) and avoids the
    # check-then-create race of the previous exists()+mkdir pattern.
    os.makedirs(args.output_path, exist_ok=True)

    test_loader = init_loader(
        test_batch_size=args.test_batch_size,
        split="test",
        data_path=args.data_path,
        dataset_name=args.dataset_name,
        dataset_config_name=args.dataset_config_name,
        max_test_samples=args.max_test_samples)

    bayesian_summarizer = BayesianSummarizer(
        model_name_or_path=args.model_path,
        tokenizer_name=args.tokenizer_name,
        text_column=args.text_column,
        summary_column=args.summary_column,
        seed=args.seed,
        max_source_length=args.max_source_length,
        num_beams=args.num_beams,
    )
    bayesian_summarizer.init_sum()

    # n MC forward passes per article; bleuvars holds per-article BLEU variance.
    generated_sums, target_sums, article_ids, bleuvars = bayesian_summarizer.generate_bayesian_summaries(
        dataloader=test_loader, n=args.mc_samples)

    metrics, mdf = score_standard(
        gen_sums=generated_sums,
        target_sums=target_sums,
        article_ids=article_ids)
    mdf["bleuvar"] = bleuvars

    print(mdf)
    print(metrics)
    # Tab-separated because generated summaries routinely contain commas.
    mdf.to_csv(os.path.join(args.output_path, "generated_sums.csv"), sep="\t", index=False)
    metrics.to_csv(os.path.join(args.output_path, "metrics.csv"), index=False)
# Script entry point: run the pipeline only when executed directly,
# not when imported as a module.
if __name__ == "__main__":
    main()