Datasets

Note

UnsafeQA, PromptsDE, PromptsFR, PromptsIT, and PromptsES will be released soon!

Dataset	Category	Sub-category	Total	Unsafe	Alias
AdvBench Behaviors	Prompts	Instructions	520	100%	advbench_behaviors
HarmBench Behaviors	Prompts	Instructions	320	100%	harmbench_behaviors
I-CoNa	Prompts	Instructions	178	100%	i_cona
I-Controversial	Prompts	Instructions	40	100%	i_controversial
I-MaliciousInstructions	Prompts	Instructions	100	100%	i_malicious_instructions
I-Physical-Safety	Prompts	Instructions	200	50%	i_physical_safety
MaliciousInstruct	Prompts	Instructions	100	100%	malicious_instruct
MITRE	Prompts	Instructions	977	100%	mitre
StrongREJECT Instructions	Prompts	Instructions	213	100%	strong_reject_instructions
TDCRedTeaming	Prompts	Instructions	50	100%	tdc_red_teaming
CatQA	Prompts	Questions	550	100%	cat_qa
Do Anything Now Questions	Prompts	Questions	390	100%	do_anything_now_questions
DoNotAnswer	Prompts	Questions	939	100%	do_not_answer
HarmfulQ	Prompts	Questions	200	100%	harmful_q
HarmfulQA Questions	Prompts	Questions	1960	100%	harmful_qa_questions
HEx-PHI	Prompts	Questions	330	100%	hex_phi
XSTest	Prompts	Questions	450	44%	xstest
AdvBench Strings	Prompts	Statements	574	100%	advbench_strings
DecodingTrust Stereotypes	Prompts	Statements	1152	100%	decoding_trust_stereotypes
DynaHate	Prompts	Statements	4120	55%	dynahate
HateCheck	Prompts	Statements	3728	69%	hatecheck
Hatemoji Check	Prompts	Statements	3683	71%	hatemoji_check
SafeText	Prompts	Statements	1465	25%	safe_text
ToxiGen	Prompts	Statements	940	43%	toxigen
AART	Prompts	Mixed	3269	100%	aart
OpenAI Moderation Dataset	Prompts	Mixed	1680	31%	openai_moderation_dataset
SimpleSafetyTests	Prompts	Mixed	100	100%	simple_safety_tests
Toxic Chat	Prompts	Mixed	5083	7%	toxic_chat
BeaverTails 330k	Conversations	Single-Turn	11088	55%	beaver_tails_330k
Bot-Adversarial Dialogue	Conversations	Multi-Turn	2598	36%	bot_adversarial_dialogue
ConvAbuse	Conversations	Multi-Turn	853	15%	convabuse
DICES 350	Conversations	Multi-Turn	350	23%	dices_350
DICES 990	Conversations	Multi-Turn	990	16%	dices_990
HarmfulQA	Conversations	Multi-Turn	16459	45%	harmful_qa
ProsocialDialog	Conversations	Multi-Turn	25029	60%	prosocial_dialog

AdvBench Behaviors

Language(s): English
Source(s): Machine-generated
License: MIT
Alias: advbench_behaviors

BibTeX

@article{advbench,
  author     = {Zou, Andy and
                Wang, Zifan and
                Kolter, J. Zico and
                Fredrikson, Matt},
  title      = {Universal and Transferable Adversarial Attacks on Aligned Language
                Models},
  journal    = {CoRR},
  volume     = {abs/2307.15043},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2307.15043},
  doi        = {10.48550/ARXIV.2307.15043},
  eprinttype = {arXiv},
  eprint     = {2307.15043},
  timestamp  = {Mon, 28 Aug 2023 21:26:19 +0200},
}

HarmBench Behaviors

Language(s): English
Source(s): Human-generated
License: MIT
Alias: harmbench_behaviors

BibTeX

@article{harmbench_behaviors,
  author     = {Mazeika, Mantas and
                Phan, Long and
                Yin, Xuwang and
                Zou, Andy and
                Wang, Zifan and
                Mu, Norman and
                Sakhaee, Elham and
                Li, Nathaniel and
                Basart, Steven and
                Li, Bo and
                Forsyth, David A. and
                Hendrycks, Dan},
  title      = {HarmBench: {A} Standardized Evaluation Framework for Automated Red
                Teaming and Robust Refusal},
  journal    = {CoRR},
  volume     = {abs/2402.04249},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2402.04249},
  doi        = {10.48550/ARXIV.2402.04249},
  eprinttype = {arXiv},
  eprint     = {2402.04249},
  timestamp  = {Mon, 12 Feb 2024 17:35:51 +0100},
}

I-CoNa

Language(s): English
Source(s): Human-generated
License: CC BY-NC 4.0
Alias: i_cona

BibTeX

@article{i_datasets,
  author       = {Federico Bianchi and
                  Mirac Suzgun and
                  Giuseppe Attanasio and
                  Paul R{\"{o}}ttger and
                  Dan Jurafsky and
                  Tatsunori Hashimoto and
                  James Zou},
  title        = {Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language
                  Models that Follow Instructions},
  journal      = {CoRR},
  volume       = {abs/2309.07875},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2309.07875},
  doi          = {10.48550/ARXIV.2309.07875},
  eprinttype   = {arXiv},
  eprint       = {2309.07875},
  timestamp    = {Wed, 20 Sep 2023 11:04:35 +0200},
}

I-Controversial

Language(s): English
Source(s): Human-generated
License: CC BY-NC 4.0
Alias: i_controversial

BibTeX

@article{i_datasets,
  author       = {Federico Bianchi and
                  Mirac Suzgun and
                  Giuseppe Attanasio and
                  Paul R{\"{o}}ttger and
                  Dan Jurafsky and
                  Tatsunori Hashimoto and
                  James Zou},
  title        = {Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language
                  Models that Follow Instructions},
  journal      = {CoRR},
  volume       = {abs/2309.07875},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2309.07875},
  doi          = {10.48550/ARXIV.2309.07875},
  eprinttype   = {arXiv},
  eprint       = {2309.07875},
  timestamp    = {Wed, 20 Sep 2023 11:04:35 +0200},
}

I-MaliciousInstructions

Language(s): English
Source(s): Machine-generated
License: CC BY-NC 4.0
Alias: i_malicious_instructions

BibTeX

@article{i_datasets,
  author       = {Federico Bianchi and
                  Mirac Suzgun and
                  Giuseppe Attanasio and
                  Paul R{\"{o}}ttger and
                  Dan Jurafsky and
                  Tatsunori Hashimoto and
                  James Zou},
  title        = {Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language
                  Models that Follow Instructions},
  journal      = {CoRR},
  volume       = {abs/2309.07875},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2309.07875},
  doi          = {10.48550/ARXIV.2309.07875},
  eprinttype   = {arXiv},
  eprint       = {2309.07875},
  timestamp    = {Wed, 20 Sep 2023 11:04:35 +0200},
}

I-Physical-Safety

Language(s): English
Source(s): Human-generated
License: CC BY-NC 4.0
Alias: i_physical_safety

BibTeX

@article{i_datasets,
  author       = {Federico Bianchi and
                  Mirac Suzgun and
                  Giuseppe Attanasio and
                  Paul R{\"{o}}ttger and
                  Dan Jurafsky and
                  Tatsunori Hashimoto and
                  James Zou},
  title        = {Safety-Tuned LLaMAs: Lessons From Improving the Safety of Large Language
                  Models that Follow Instructions},
  journal      = {CoRR},
  volume       = {abs/2309.07875},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2309.07875},
  doi          = {10.48550/ARXIV.2309.07875},
  eprinttype   = {arXiv},
  eprint       = {2309.07875},
  timestamp    = {Wed, 20 Sep 2023 11:04:35 +0200},
}

MaliciousInstruct

Language(s): English
Source(s): Machine-generated
License: MIT
Alias: malicious_instruct

BibTeX

@article{malicious_instruct,
  author     = {Huang, Yangsibo and
                Gupta, Samyak and
                Xia, Mengzhou and
                Li, Kai and
                Chen, Danqi},
  title      = {Catastrophic Jailbreak of Open-source LLMs via Exploiting Generation},
  journal    = {CoRR},
  volume     = {abs/2310.06987},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2310.06987},
  doi        = {10.48550/ARXIV.2310.06987},
  eprinttype = {arXiv},
  eprint     = {2310.06987},
  timestamp  = {Thu, 14 Dec 2023 18:03:42 +0100},
}

MITRE

Language(s): English
Source(s): Human-generated
License: MIT
Alias: mitre

BibTeX

@article{mitre,
  author     = {Bhatt, Manish and
                Chennabasappa, Sahana and
                Li, Yue and
                Nikolaidis, Cyrus and
                Song, Daniel and
                Wan, Shengye and
                Ahmad, Faizan and
                Aschermann, Cornelius and
                Chen, Yaohui and
                Kapil, Dhaval and
                Molnar, David and
                Whitman, Spencer and
                Saxe, Joshua},
  title      = {CyberSecEval 2: A Wide-Ranging Cybersecurity Evaluation Suite for Large Language Models},
  journal    = {CoRR},
  volume     = {abs/2404.13161},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2404.13161},
  doi        = {10.48550/ARXIV.2404.13161},
  eprinttype = {arXiv},
  eprint     = {2404.13161},
  timestamp  = {Fri, 19 Apr 2024 11:04:35 +0200},
}

StrongREJECT Instructions

Language(s): English
Source(s): Human-generated
License: N/A
Alias: strong_reject_instructions

BibTeX

@article{strong_reject_instructions,
  author       = {Alexandra Souly and
                  Qingyuan Lu and
                  Dillon Bowen and
                  Tu Trinh and
                  Elvis Hsieh and
                  Sana Pandey and
                  Pieter Abbeel and
                  Justin Svegliato and
                  Scott Emmons and
                  Olivia Watkins and
                  Sam Toyer},
  title        = {A StrongREJECT for Empty Jailbreaks},
  journal      = {CoRR},
  volume       = {abs/2402.10260},
  year         = {2024},
  url          = {https://doi.org/10.48550/arXiv.2402.10260},
  doi          = {10.48550/ARXIV.2402.10260},
  eprinttype   = {arXiv},
  eprint       = {2402.10260},
  timestamp    = {Mon, 26 Feb 2024 16:52:48 +0100},
}

TDCRedTeaming

Language(s): English
Source(s): Human-generated
License: MIT
Alias: tdc_red_teaming

BibTeX

@inproceedings{tdc_red_teaming,
  author       = {Mantas Mazeika and
                  Andy Zou and
                  Norman Mu and
                  Long Phan and
                  Zifan Wang and
                  Chunru Yu and
                  Adam Khoja and
                  Fengqing Jiang and
                  Aidan O'Gara and
                  Ellie Sakhaee and
                  Zhen Xiang and
                  Arezoo Rajabi and
                  Dan Hendrycks and
                  Radha Poovendran and
                  Bo Li and
                  David Forsyth},
  title        = {TDC 2023 (LLM Edition): The Trojan Detection Challenge},
  booktitle    = {NeurIPS Competition Track},
  year         = {2023},
}

CatQA

Language(s): English
Source(s): Machine-generated
License: Apache 2.0
Alias: cat_qa

BibTeX

@article{cat_qa,
  author     = {Bhardwaj, Rishabh and
                Anh, Do Duc and
                Poria, Soujanya},
  title      = {Language Models are Homer Simpson! Safety Re-Alignment of Fine-tuned
                Language Models through Task Arithmetic},
  journal    = {CoRR},
  volume     = {abs/2402.11746},
  year       = {2024},
  url        = {https://doi.org/10.48550/arXiv.2402.11746},
  doi        = {10.48550/ARXIV.2402.11746},
  eprinttype = {arXiv},
  eprint     = {2402.11746},
  timestamp  = {Mon, 26 Feb 2024 16:52:48 +0100},
}

Do Anything Now Questions

Language(s): English
Source(s): Discord, Reddit, AIPRM, FlowGPT, JailbreakChat, Awesome ChatGPT Prompts, OCR-Prompts
License: MIT
Alias: do_anything_now_questions

BibTeX

@article{do_anything_now,
  author       = {Xinyue Shen and
                  Zeyuan Chen and
                  Michael Backes and
                  Yun Shen and
                  Yang Zhang},
  title        = {{``Do Anything Now''}: Characterizing and Evaluating In-The-Wild Jailbreak
                  Prompts on Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2308.03825},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2308.03825},
  doi          = {10.48550/ARXIV.2308.03825},
  eprinttype   = {arXiv},
  eprint       = {2308.03825},
  timestamp    = {Fri, 27 Oct 2023 08:36:05 +0200},
}

DoNotAnswer

Language(s): English
Source(s): Machine-generated
License: Apache 2.0
Alias: do_not_answer

BibTeX

@inproceedings{do_not_answer,
  author    = {Wang, Yuxia and
               Li, Haonan and
               Han, Xudong and
               Nakov, Preslav and
               Baldwin, Timothy},
  title     = {Do-Not-Answer: Evaluating Safeguards in LLMs},
  booktitle = {Findings of the Association for Computational Linguistics: {EACL}
               2024, St. Julian's, Malta, March 17-22, 2024},
  pages     = {896--911},
  publisher = {Association for Computational Linguistics},
  year      = {2024},
  url       = {https://aclanthology.org/2024.findings-eacl.61},
  timestamp = {Tue, 02 Apr 2024 16:32:10 +0200},
}

HarmfulQ

Language(s): English
Source(s): Machine-generated
License: CC BY-SA 4.0
Alias: harmful_q

BibTeX

@inproceedings{harmful_q,
  author    = {Shaikh, Omar and
               Zhang, Hongxin and
               Held, William and
               Bernstein, Michael S. and
               Yang, Diyi},
  title     = {On Second Thought, Let's Not Think Step by Step! Bias and Toxicity
               in Zero-Shot Reasoning},
  booktitle = {Proceedings of the 61st Annual Meeting of the Association for Computational
               Linguistics (Volume 1: Long Papers), {ACL} 2023, Toronto, Canada,
               July 9-14, 2023},
  pages     = {4454--4470},
  publisher = {Association for Computational Linguistics},
  year      = {2023},
  url       = {https://doi.org/10.18653/v1/2023.acl-long.244},
  doi       = {10.18653/V1/2023.ACL-LONG.244},
  timestamp = {Mon, 25 Sep 2023 15:32:28 +0200},
}

HarmfulQA Questions

Language(s): English
Source(s): Machine-generated
License: Apache 2.0
Alias: harmful_qa_questions

BibTeX

@article{harmful_qa,
  author     = {Bhardwaj, Rishabh and
                Poria, Soujanya},
  title      = {Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment},
  journal    = {CoRR},
  volume     = {abs/2308.09662},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2308.09662},
  doi        = {10.48550/ARXIV.2308.09662},
  eprinttype = {arXiv},
  eprint     = {2308.09662},
  timestamp  = {Fri, 25 Aug 2023 11:24:49 +0200},
}

HEx-PHI

Language(s): English
Source(s): Human-generated
License: https://huggingface.co/datasets/LLM-Tuning-Safety/HEx-PHI#hex-phi-dataset-license-agreement
Alias: hex_phi

BibTeX

@article{hex_phi,
  author     = {Qi, Xiangyu and
                Zeng, Yi and
                Xie, Tinghao and
                Chen, Pin{-}Yu and
                Jia, Ruoxi and
                Mittal, Prateek and
                Henderson, Peter},
  title      = {Fine-tuning Aligned Language Models Compromises Safety, Even When
                Users Do Not Intend To!},
  journal    = {CoRR},
  volume     = {abs/2310.03693},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2310.03693},
  doi        = {10.48550/ARXIV.2310.03693},
  eprinttype = {arXiv},
  eprint     = {2310.03693},
  timestamp  = {Wed, 20 Mar 2024 11:34:12 +0100},
}

XSTest

Language(s): English
Source(s): Human-generated
License: CC BY 4.0
Alias: xstest

BibTeX

@article{xstest,
  author       = {Paul R{\"{o}}ttger and
                  Hannah Rose Kirk and
                  Bertie Vidgen and
                  Giuseppe Attanasio and
                  Federico Bianchi and
                  Dirk Hovy},
  title        = {XSTest: {A} Test Suite for Identifying Exaggerated Safety Behaviours
                  in Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2308.01263},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2308.01263},
  doi          = {10.48550/ARXIV.2308.01263},
  eprinttype   = {arXiv},
  eprint       = {2308.01263},
  timestamp    = {Mon, 21 Aug 2023 17:38:10 +0200},
}

AdvBench Strings

Language(s): English
Source(s): Machine-generated
License: MIT
Alias: advbench_strings

BibTeX

@article{advbench,
  author     = {Zou, Andy and
                Wang, Zifan and
                Kolter, J. Zico and
                Fredrikson, Matt},
  title      = {Universal and Transferable Adversarial Attacks on Aligned Language
                Models},
  journal    = {CoRR},
  volume     = {abs/2307.15043},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2307.15043},
  doi        = {10.48550/ARXIV.2307.15043},
  eprinttype = {arXiv},
  eprint     = {2307.15043},
  timestamp  = {Mon, 28 Aug 2023 21:26:19 +0200},
}

DecodingTrust Stereotypes

Language(s): English
Source(s): Human-generated
License: CC BY-SA 4.0
Alias: decoding_trust_stereotypes

BibTeX

@inproceedings{decoding_trust_stereotypes,
  author       = {Boxin Wang and
                  Weixin Chen and
                  Hengzhi Pei and
                  Chulin Xie and
                  Mintong Kang and
                  Chenhui Zhang and
                  Chejian Xu and
                  Zidi Xiong and
                  Ritik Dutta and
                  Rylan Schaeffer and
                  Sang T. Truong and
                  Simran Arora and
                  Mantas Mazeika and
                  Dan Hendrycks and
                  Zinan Lin and
                  Yu Cheng and
                  Sanmi Koyejo and
                  Dawn Song and
                  Bo Li},
  title        = {DecodingTrust: {A} Comprehensive Assessment of Trustworthiness in
                  {GPT} Models},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/63cb9921eecf51bfad27a99b2c53dd6d-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Tue, 02 Apr 2024 16:32:39 +0200},
}

DynaHate

Language(s): English
Source(s): Human-generated
License: Apache 2.0
Alias: dynahate

BibTeX

@inproceedings{dynahate,
  author    = {Vidgen, Bertie and
               Thrush, Tristan and
               Waseem, Zeerak and
               Kiela, Douwe},
  title     = {Learning from the Worst: Dynamically Generated Datasets to Improve
               Online Hate Detection},
  booktitle = {Proceedings of the 59th Annual Meeting of the Association for Computational
               Linguistics and the 11th International Joint Conference on Natural
               Language Processing, {ACL/IJCNLP} 2021, (Volume 1: Long Papers), Virtual
               Event, August 1-6, 2021},
  pages     = {1667--1682},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  url       = {https://doi.org/10.18653/v1/2021.acl-long.132},
  doi       = {10.18653/V1/2021.ACL-LONG.132},
  timestamp = {Mon, 09 Aug 2021 16:25:37 +0200},
}

HateCheck

Language(s): English
Source(s): Algorithmically generated
License: CC BY 4.0
Alias: hatecheck

BibTeX

@inproceedings{hatecheck,
  author       = {Paul R{\"{o}}ttger and
                  Bertie Vidgen and
                  Dong Nguyen and
                  Zeerak Waseem and
                  Helen Z. Margetts and
                  Janet B. Pierrehumbert},
  title        = {HateCheck: Functional Tests for Hate Speech Detection Models},
  booktitle    = {Proceedings of the 59th Annual Meeting of the Association for Computational
                  Linguistics and the 11th International Joint Conference on Natural
                  Language Processing, {ACL/IJCNLP} 2021, (Volume 1: Long Papers), Virtual
                  Event, August 1-6, 2021},
  pages        = {41--58},
  publisher    = {Association for Computational Linguistics},
  year         = {2021},
  url          = {https://doi.org/10.18653/v1/2021.acl-long.4},
  doi          = {10.18653/V1/2021.ACL-LONG.4},
  timestamp    = {Mon, 05 Feb 2024 20:27:00 +0100},
}

Hatemoji Check

Language(s): English
Source(s): Algorithmically generated
License: CC BY 4.0
Alias: hatemoji_check

BibTeX

@inproceedings{hatemoji,
  author       = {Hannah Kirk and
                  Bertie Vidgen and
                  Paul R{\"{o}}ttger and
                  Tristan Thrush and
                  Scott A. Hale},
  title        = {Hatemoji: {A} Test Suite and Adversarially-Generated Dataset for Benchmarking
                  and Detecting Emoji-Based Hate},
  booktitle    = {Proceedings of the 2022 Conference of the North American Chapter of
                  the Association for Computational Linguistics: Human Language Technologies,
                  {NAACL} 2022, Seattle, WA, United States, July 10-15, 2022},
  pages        = {1352--1368},
  publisher    = {Association for Computational Linguistics},
  year         = {2022},
  url          = {https://doi.org/10.18653/v1/2022.naacl-main.97},
  doi          = {10.18653/V1/2022.NAACL-MAIN.97},
  timestamp    = {Mon, 26 Jun 2023 20:46:58 +0200},
}

SafeText

Language(s): English
Source(s): Human-generated
License: MIT
Alias: safe_text

BibTeX

@inproceedings{safe_text,
  author    = {Levy, Sharon and
               Allaway, Emily and
               Subbiah, Melanie and
               Chilton, Lydia B. and
               Patton, Desmond and
               McKeown, Kathleen R. and
               Wang, William Yang},
  title     = {SafeText: {A} Benchmark for Exploring Physical Safety in Language
               Models},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural
               Language Processing, {EMNLP} 2022, Abu Dhabi, United Arab Emirates,
               December 7-11, 2022},
  pages     = {2407--2421},
  publisher = {Association for Computational Linguistics},
  year      = {2022},
  url       = {https://doi.org/10.18653/v1/2022.emnlp-main.154},
  doi       = {10.18653/V1/2022.EMNLP-MAIN.154},
  timestamp = {Thu, 10 Aug 2023 12:35:27 +0200},
}

ToxiGen

Language(s): English
Source(s): Machine-generated
License: MIT
Alias: toxigen

BibTeX

@inproceedings{toxigen,
  title     = {{T}oxi{G}en: A Large-Scale Machine-Generated Dataset for Adversarial and Implicit Hate Speech Detection},
  author    = {Hartvigsen, Thomas and
               Gabriel, Saadia and
               Palangi, Hamid and
               Sap, Maarten and
               Ray, Dipankar and
               Kamar, Ece},
  editor    = {Muresan, Smaranda and
               Nakov, Preslav and
               Villavicencio, Aline},
  booktitle = {Proceedings of the 60th Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers)},
  month     = may,
  year      = {2022},
  address   = {Dublin, Ireland},
  publisher = {Association for Computational Linguistics},
  url       = {https://aclanthology.org/2022.acl-long.234},
  doi       = {10.18653/v1/2022.acl-long.234},
  pages     = {3309--3326},
}

AART

Language(s): English
Source(s): Machine-generated
License: CC BY 4.0
Alias: aart

BibTeX

@inproceedings{aart,
  author    = {Radharapu, Bhaktipriya and
               Robinson, Kevin and
               Aroyo, Lora and
               Lahoti, Preethi},
  title     = {{AART:} AI-Assisted Red-Teaming with Diverse Data Generation for New
               LLM-powered Applications},
  booktitle = {Proceedings of the 2023 Conference on Empirical Methods in Natural
               Language Processing: {EMNLP} 2023 - Industry Track, Singapore, December
               6-10, 2023},
  pages     = {380--395},
  publisher = {Association for Computational Linguistics},
  year      = {2023},
  url       = {https://doi.org/10.18653/v1/2023.emnlp-industry.37},
  doi       = {10.18653/V1/2023.EMNLP-INDUSTRY.37},
  timestamp = {Fri, 12 Apr 2024 13:11:30 +0200},
}

OpenAI Moderation Dataset

Language(s): English
Source(s): Human-generated
License: MIT
Alias: openai_moderation_dataset

BibTeX

@inproceedings{openai_moderation_dataset,
  author    = {Markov, Todor and
               Zhang, Chong and
               Agarwal, Sandhini and
               Nekoul, Florentine Eloundou and
               Lee, Theodore and
               Adler, Steven and
               Jiang, Angela and
               Weng, Lilian},
  title     = {A Holistic Approach to Undesired Content Detection in the Real World},
  booktitle = {Thirty-Seventh {AAAI} Conference on Artificial Intelligence, {AAAI}
               2023, Thirty-Fifth Conference on Innovative Applications of Artificial
               Intelligence, {IAAI} 2023, Thirteenth Symposium on Educational Advances
               in Artificial Intelligence, {EAAI} 2023, Washington, DC, USA, February
               7-14, 2023},
  pages     = {15009--15018},
  publisher = {{AAAI} Press},
  year      = {2023},
  url       = {https://doi.org/10.1609/aaai.v37i12.26752},
  doi       = {10.1609/AAAI.V37I12.26752},
  timestamp = {Mon, 04 Sep 2023 16:50:26 +0200},
}

SimpleSafetyTests

Language(s): English
Source(s): Human-generated
License: CC BY 4.0
Alias: simple_safety_tests

BibTeX

@article{simple_safety_tests,
  author       = {Bertie Vidgen and
                  Hannah Rose Kirk and
                  Rebecca Qian and
                  Nino Scherrer and
                  Anand Kannappan and
                  Scott A. Hale and
                  Paul R{\"{o}}ttger},
  title        = {SimpleSafetyTests: a Test Suite for Identifying Critical Safety Risks
                  in Large Language Models},
  journal      = {CoRR},
  volume       = {abs/2311.08370},
  year         = {2023},
  url          = {https://doi.org/10.48550/arXiv.2311.08370},
  doi          = {10.48550/ARXIV.2311.08370},
  eprinttype   = {arXiv},
  eprint       = {2311.08370},
  timestamp    = {Tue, 21 Nov 2023 13:55:21 +0100},
}

Toxic Chat

Language(s): English
Source(s): Vicuna online demo data
License: CC BY-NC 4.0
Alias: toxic_chat

BibTeX

@inproceedings{toxic_chat,
  author    = {Lin, Zi and
               Wang, Zihan and
               Tong, Yongqi and
               Wang, Yangkun and
               Guo, Yuxin and
               Wang, Yujia and
               Shang, Jingbo},
  title     = {ToxicChat: Unveiling Hidden Challenges of Toxicity Detection in Real-World
               User-AI Conversation},
  booktitle = {Findings of the Association for Computational Linguistics: {EMNLP}
               2023, Singapore, December 6-10, 2023},
  pages     = {4694--4702},
  publisher = {Association for Computational Linguistics},
  year      = {2023},
  url       = {https://doi.org/10.18653/v1/2023.findings-emnlp.311},
  doi       = {10.18653/V1/2023.FINDINGS-EMNLP.311},
  timestamp = {Fri, 12 Apr 2024 13:11:31 +0200},
}

BeaverTails 330k

Language(s): English
Source(s): Human-AI conversations / manually labelled
License: MIT
Alias: beaver_tails_330k

BibTeX

@inproceedings{beaver_tails,
  author    = {Ji, Jiaming and
               Liu, Mickel and
               Dai, Josef and
               Pan, Xuehai and
               Zhang, Chi and
               Bian, Ce and
               Chen, Boyuan and
               Sun, Ruiyang and
               Wang, Yizhou and
               Yang, Yaodong},
  title     = {BeaverTails: Towards Improved Safety Alignment of {LLM} via a Human-Preference
               Dataset},
  booktitle = {Advances in Neural Information Processing Systems 36: Annual Conference
               on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
               LA, USA, December 10 - 16, 2023},
  year      = {2023},
  url       = {http://papers.nips.cc/paper\_files/paper/2023/hash/4dbb61cb68671edc4ca3712d70083b9f-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp = {Fri, 01 Mar 2024 16:26:20 +0100},
}

Bot-Adversarial Dialogue

Language(s): English
Source(s): real-world
License: Apache 2.0
Alias: bot_adversarial_dialogue

BibTeX

@inproceedings{bot_adversarial_dialogue,
  author    = {Xu, Jing and
               Ju, Da and
               Li, Margaret and
               Boureau, Y{-}Lan and
               Weston, Jason and
               Dinan, Emily},
  title     = {Bot-Adversarial Dialogue for Safe Conversational Agents},
  booktitle = {Proceedings of the 2021 Conference of the North American Chapter of
               the Association for Computational Linguistics: Human Language Technologies,
               {NAACL-HLT} 2021, Online, June 6-11, 2021},
  pages     = {2950--2968},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  url       = {https://doi.org/10.18653/v1/2021.naacl-main.235},
  doi       = {10.18653/V1/2021.NAACL-MAIN.235},
  timestamp = {Fri, 06 Aug 2021 00:41:31 +0200},
}

ConvAbuse

Language(s): English
Source(s): Human-AI conversations
License: CC BY 4.0
Alias: convabuse

BibTeX

@inproceedings{convabuse,
  author    = {Curry, Amanda Cercas and
               Abercrombie, Gavin and
               Rieser, Verena},
  title     = {ConvAbuse: Data, Analysis, and Benchmarks for Nuanced Detection in
               Conversational {AI}},
  booktitle = {Proceedings of the 2021 Conference on Empirical Methods in Natural
               Language Processing, {EMNLP} 2021, Virtual Event / Punta Cana, Dominican
               Republic, 7-11 November, 2021},
  pages     = {7388--7403},
  publisher = {Association for Computational Linguistics},
  year      = {2021},
  url       = {https://doi.org/10.18653/v1/2021.emnlp-main.587},
  doi       = {10.18653/V1/2021.EMNLP-MAIN.587},
  timestamp = {Fri, 16 Feb 2024 08:27:36 +0100},
}

DICES 350

Language(s): English
Source(s): Human-AI conversations
License: CC BY 4.0
Alias: dices_350

BibTeX

@inproceedings{dices,
  author       = {Lora Aroyo and
                  Alex S. Taylor and
                  Mark D{\'{\i}}az and
                  Christopher Homan and
                  Alicia Parrish and
                  Gregory Serapio{-}Garc{\'{\i}}a and
                  Vinodkumar Prabhakaran and
                  Ding Wang},
  title        = {{DICES} Dataset: Diversity in Conversational {AI} Evaluation for Safety},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/a74b697bce4cac6c91896372abaa8863-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Fri, 01 Mar 2024 16:26:20 +0100},
}

DICES 990

Language(s): English
Source(s): Human-AI conversations
License: CC BY 4.0
Alias: dices_990

BibTeX

@inproceedings{dices,
  author       = {Lora Aroyo and
                  Alex S. Taylor and
                  Mark D{\'{\i}}az and
                  Christopher Homan and
                  Alicia Parrish and
                  Gregory Serapio{-}Garc{\'{\i}}a and
                  Vinodkumar Prabhakaran and
                  Ding Wang},
  title        = {{DICES} Dataset: Diversity in Conversational {AI} Evaluation for Safety},
  booktitle    = {Advances in Neural Information Processing Systems 36: Annual Conference
                  on Neural Information Processing Systems 2023, NeurIPS 2023, New Orleans,
                  LA, USA, December 10 - 16, 2023},
  year         = {2023},
  url          = {http://papers.nips.cc/paper\_files/paper/2023/hash/a74b697bce4cac6c91896372abaa8863-Abstract-Datasets\_and\_Benchmarks.html},
  timestamp    = {Fri, 01 Mar 2024 16:26:20 +0100},
}

HarmfulQA

Language(s): English
Source(s): Machine-generated
License: Apache 2.0
Alias: harmful_qa

BibTeX

@article{harmful_qa,
  author     = {Bhardwaj, Rishabh and
                Poria, Soujanya},
  title      = {Red-Teaming Large Language Models using Chain of Utterances for Safety-Alignment},
  journal    = {CoRR},
  volume     = {abs/2308.09662},
  year       = {2023},
  url        = {https://doi.org/10.48550/arXiv.2308.09662},
  doi        = {10.48550/ARXIV.2308.09662},
  eprinttype = {arXiv},
  eprint     = {2308.09662},
  timestamp  = {Fri, 25 Aug 2023 11:24:49 +0200},
}

ProsocialDialog

Language(s): English
Source(s): Human-Machine-generated
License: CC BY 4.0
Alias: prosocial_dialog

BibTeX

@inproceedings{prosocial_dialog,
  author    = {Kim, Hyunwoo and
               Yu, Youngjae and
               Jiang, Liwei and
               Lu, Ximing and
               Khashabi, Daniel and
               Kim, Gunhee and
               Choi, Yejin and
               Sap, Maarten},
  title     = {ProsocialDialog: {A} Prosocial Backbone for Conversational Agents},
  booktitle = {Proceedings of the 2022 Conference on Empirical Methods in Natural
               Language Processing, {EMNLP} 2022, Abu Dhabi, United Arab Emirates,
               December 7-11, 2022},
  pages     = {4005--4029},
  publisher = {Association for Computational Linguistics},
  year      = {2022},
  url       = {https://doi.org/10.18653/v1/2022.emnlp-main.267},
  doi       = {10.18653/V1/2022.EMNLP-MAIN.267},
  timestamp = {Thu, 10 Aug 2023 12:35:27 +0200},
}

Files

datasets.md

Latest commit

History

datasets.md

File metadata and controls

Datasets

AdvBench Behaviors

HarmBench Behaviors

I-CoNa

I-Controversial

I-MaliciousInstructions

I-Physical-Safety

MaliciousInstruct

MITRE

StrongREJECT Instructions

TDCRedTeaming

CatQA

Do Anything Now Questions

DoNotAnswer

HarmfulQ

HarmfulQA Questions

HEx-PHI

XSTest

AdvBench Strings

DecodingTrust Stereotypes

DynaHate

HateCheck

Hatemoji Check

SafeText

ToxiGen

AART

OpenAI Moderation Dataset

SimpleSafetyTests

Toxic Chat

BeaverTails 330k

Bot-Adversarial Dialogue

ConvAbuse

DICES 350

DICES 990

HarmfulQA

ProsocialDialog