% conditioning.bib
@techreport{11_proposals,
title = {An overview of 11 proposals for building safe advanced {AI}},
url = {http://arxiv.org/abs/2012.07532},
abstract = {This paper analyzes and compares 11 different proposals for building safe advanced AI under the current machine learning paradigm, including major contenders such as iterated amplification, AI safety via debate, and recursive reward modeling. Each proposal is evaluated on the four components of outer alignment, inner alignment, training competitiveness, and performance competitiveness, of which the distinction between the latter two is introduced in this paper. While prior literature has primarily focused on analyzing individual proposals, or primarily focused on outer alignment at the expense of inner alignment, this analysis seeks to take a comparative look at a wide range of proposals including a comparative analysis across all four previously mentioned components.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Hubinger, Evan},
month = dec,
year = {2020},
}
@article{amplification,
author={Paul Christiano and Buck Shlegeris and Dario Amodei},
title={Supervising strong learners by amplifying weak experts},
journal={arXiv},
year=2018,
url={https://arxiv.org/abs/1810.08575},
}
@article{debate,
author={Geoffrey Irving and Paul Christiano and Dario Amodei},
title={{AI} safety via debate},
journal={arXiv},
year=2018,
url={https://arxiv.org/abs/1805.00899},
}
@article{leike,
author={Jan Leike and David Krueger and Tom Everitt and Miljan Martic and Vishal Maini and Shane Legg},
title={Scalable agent alignment via reward modeling: a research direction},
journal={arXiv},
year=2018,
url={https://arxiv.org/abs/1811.07871},
}
@article{risks,
author={Evan Hubinger and Chris van Merwijk and Vladimir Mikulik and Joar Skalse and Scott Garrabrant},
title={{Risks from Learned Optimization in Advanced Machine Learning Systems}},
journal={arXiv},
year=2019,
url={https://arxiv.org/abs/1906.01820},
}
@book{superintelligence,
author={Nick Bostrom},
title={Superintelligence: Paths, Dangers, Strategies},
publisher={Oxford University Press},
year=2014,
url={https://global.oup.com/academic/product/superintelligence-9780199678112?cc=us&lang=en&},
}
@misc{outer_alignment,
author={Evan Hubinger},
title={Outer alignment and imitative amplification},
year=2020,
url={https://www.alignmentforum.org/posts/33EKjmAdKFn3pbKPJ/outer-alignment-and-imitative-amplification},
}
@misc{market_making,
author={Evan Hubinger},
title={{AI} safety via market making},
year=2020,
url={https://www.alignmentforum.org/posts/YWwzccGbcHMJMpT45/ai-safety-via-market-making},
}
@article{tool_use,
author={Bowen Baker and Ingmar Kanitscheider and Todor Markov and Yi Wu and Glenn Powell and Bob McGrew and Igor Mordatch},
title={{Emergent Tool Use From Multi-Agent Autocurricula}},
journal={arXiv},
year=2019,
url={https://arxiv.org/abs/1909.07528},
}
@misc{multi_agent_safety,
author={Richard Ngo},
title={Multi-agent safety},
year=2020,
url={https://www.alignmentforum.org/posts/BXMCgpktdiawT3K5v/multi-agent-safety},
}
@article{circuits,
author={Chris Olah and Nick Cammarata and Ludwig Schubert and Gabriel Goh and Michael Petrov and Shan Carter},
title={{Thread: Circuits}},
journal={Distill},
year=2020,
url={https://distill.pub/2020/circuits/},
}
@misc{catastrophes,
author={Paul Christiano},
title={Learning with catastrophes},
year=2016,
url={https://ai-alignment.com/learning-with-catastrophes-59387b55cc30},
}
@misc{chris_olah,
author={Evan Hubinger},
title={{Chris Olah’s views on AGI safety}},
year=2019,
url={https://www.alignmentforum.org/posts/X2i9dQQK3gETCyqh2/chris-olah-s-views-on-agi-safety},
}
@misc{adversarial_ida,
author={Evan Hubinger},
title={{A Concrete Proposal for Adversarial IDA}},
year=2019,
url={https://www.alignmentforum.org/posts/jYvm4mmjvGHcPXtGL/a-concrete-proposal-for-adversarial-ida},
}
@misc{strong_hch,
author={Paul Christiano},
title={Strong {HCH}},
year=2016,
url={https://ai-alignment.com/strong-hch-bedb0dc08d4e},
}
@misc{universality,
author={Paul Christiano},
title={Universality and consequentialism within {HCH}},
year=2019,
url={https://ai-alignment.com/universality-and-consequentialism-within-hch-c0bee00365bd},
}
@misc{mechanistic,
author={Evan Hubinger},
title={Towards a mechanistic understanding of corrigibility},
year=2019,
url={https://www.alignmentforum.org/posts/BKM8uQS6QdJPZLqCr/towards-a-mechanistic-understanding-of-corrigibility},
}
@misc{efficient_feedback,
author={Paul Christiano},
title={Efficient feedback},
year=2015,
url={https://ai-alignment.com/efficient-feedback-a347748b1557},
}
@misc{relaxed,
author={Evan Hubinger},
title={Relaxed adversarial training for inner alignment},
year=2019,
url={https://www.alignmentforum.org/posts/9Dy5YRaoCxH9zuJqa/relaxed-adversarial-training-for-inner-alignment},
}
@misc{gradient_hacking,
author={Evan Hubinger},
title={Gradient hacking},
year=2019,
url={https://www.alignmentforum.org/posts/uXH4r6MmKPedk8rMA/gradient-hacking},
}
@article{deep_tamer,
author={Garrett Warnell and Nicholas Waytowich and Vernon Lawhern and Peter Stone},
title={{Deep TAMER: Interactive Agent Shaping in High-Dimensional State Spaces}},
journal={arXiv},
year=2017,
url={https://arxiv.org/abs/1709.10163},
}
@article{deep_rl,
author={Dilip Arumugam and Jun Ki Lee and Sophie Saskin and Michael L. Littman},
title={{Deep Reinforcement Learning from Policy-Dependent Human Feedback}},
journal={arXiv},
year=2019,
url={https://arxiv.org/abs/1902.04257},
}
@misc{model_free,
author={Paul Christiano},
title={Approval-directed agents},
year=2014,
url={https://ai-alignment.com/model-free-decisions-6e6609f5d99e},
}
@misc{visualizing,
author={Chris Olah},
title={{Visualizing Representations: Deep Learning and Human Beings}},
year=2015,
url={https://colah.github.io/posts/2015-01-Visualizing-Representations/},
}
@misc{universal_prior,
author={Paul Christiano},
title={What does the universal prior actually look like?},
year=2016,
url={https://ordinaryideas.wordpress.com/2016/11/30/what-does-the-universal-prior-actually-look-like},
}
@misc{partial_agency,
author={Abram Demski},
title={{Partial Agency}},
year=2019,
url={https://www.alignmentforum.org/s/HeYtBkNbEe7wpjc6X},
}
@article{language_models,
author={Alec Radford and Jeffrey Wu and Rewon Child and David Luan and Dario Amodei and Ilya Sutskever},
title={{Language Models are Unsupervised Multitask Learners}},
journal={{OpenAI}},
year=2019,
url={https://cdn.openai.com/better-language-models/language_models_are_unsupervised_multitask_learners.pdf},
}
@article{human_models,
author={Ramana Kumar and Scott Garrabrant},
title={Thoughts on Human Models},
journal={MIRI},
year=2019,
url={https://intelligence.org/2019/02/22/thoughts-on-human-models},
}
@article{theorem_proving,
author={Mitsuru Kusumoto and Keisuke Yahata and Masahiro Sakai},
title={{Automated Theorem Proving in Intuitionistic Propositional Logic by Deep Reinforcement Learning}},
journal={arXiv},
year=2018,
url={https://arxiv.org/abs/1811.00796},
}
@article{holist,
author={Kshitij Bansal and Sarah M. Loos and Markus N. Rabe and Christian Szegedy and Stewart Wilcox},
title={{HOList: An Environment for Machine Learning of Higher-Order Theorem Proving}},
journal={arXiv},
year=2019,
url={https://arxiv.org/abs/1904.03241},
}
@article{protein,
author={Andrew W. Senior and Richard Evans and John Jumper and James Kirkpatrick and Laurent Sifre and Tim Green and Chongli Qin and Augustin Žídek and Alexander W. R. Nelson and Alex Bridgland and Hugo Penedones and Stig Petersen and Karen Simonyan and Steve Crossan and Pushmeet Kohli and David T. Jones and David Silver and Koray Kavukcuoglu and Demis Hassabis},
title={Improved protein structure prediction using potentials from deep learning},
journal={Nature},
year=2020,
url={https://www.nature.com/articles/s41586-019-1923-7.epdf},
}
@article{vulnerable,
author={Nick Bostrom},
title={{The Vulnerable World Hypothesis}},
journal={{Global Policy}},
year=2019,
url={https://nickbostrom.com/papers/vulnerable.pdf},
}
@article{wbe,
author={Anders Sandberg and Nick Bostrom},
title={{Whole Brain Emulation: A Roadmap}},
journal={{FHI}},
year=2008,
url={https://www.fhi.ox.ac.uk/brain-emulation-roadmap-report.pdf},
}
@article{out_of_distribution,
author={Jie Ren and Peter J. Liu and Emily Fertig and Jasper Snoek and Ryan Poplin and Mark A. DePristo and Joshua V. Dillon and Balaji Lakshminarayanan},
title={{Likelihood Ratios for Out-of-Distribution Detection}},
journal={arXiv},
year=2019,
url={https://arxiv.org/abs/1906.02845},
}
@article{generative,
author={Jonathan Ho and Stefano Ermon},
title={{Generative Adversarial Imitation Learning}},
journal={arXiv},
year=2016,
url={https://arxiv.org/abs/1606.03476},
}
@article{learning_robust_rewards,
author={Justin Fu and Katie Luo and Sergey Levine},
title={{Learning Robust Rewards with Adversarial Inverse Reinforcement Learning}},
journal={arXiv},
year=2017,
url={https://arxiv.org/abs/1710.11248},
}
@article{reframing_si,
author={K. Eric Drexler},
title={{Reframing Superintelligence: Comprehensive AI Services as General Intelligence}},
journal={{FHI}},
year=2019,
url={https://www.fhi.ox.ac.uk/wp-content/uploads/Reframing_Superintelligence_FHI-TR-2019-1.1-1.pdf},
}
@misc{debate_progress,
author={Beth Barnes and Paul Christiano},
title={{Writeup: Progress on AI Safety via Debate}},
year=2020,
url={https://www.alignmentforum.org/posts/Br4xDbYu4Frwrb64a/writeup-progress-on-ai-safety-via-debate-1},
}
@article{go,
author={David Silver and Thomas Hubert and Julian Schrittwieser and Ioannis Antonoglou and Matthew Lai and Arthur Guez and Marc Lanctot and Laurent Sifre and Dharshan Kumaran and Thore Graepel and Timothy Lillicrap and Karen Simonyan and Demis Hassabis},
title={A general reinforcement learning algorithm that masters chess, shogi, and {Go} through self-play},
journal={Science},
year=2018,
url={https://science.sciencemag.org/content/362/6419/1140.full?ijkey=XGd77kI6W4rSc&keytype=ref&siteid=sci},
}
@misc{openai_five,
author={Filip Wolski and Szymon Sidor and Michael Petrov and David Farhi and Jonathan Raiman and Susan Zhang and Greg Brockman and Christy Dennison and Jie Tang and Henrique Pondé and Brooke Chan and Jakub Pachocki and Przemysław Dębiak},
title={{OpenAI Five}},
year=2018,
url={https://openai.com/blog/openai-five/},
}
@misc{alphastar,
author={{The AlphaStar team}},
title={{AlphaStar: Mastering the Real-Time Strategy Game StarCraft II}},
year=2019,
url={https://deepmind.com/blog/article/alphastar-mastering-real-time-strategy-game-starcraft-ii},
}
@misc{synthesizing,
author={Evan Hubinger},
title={Synthesizing amplification and debate},
year=2020,
url={https://www.alignmentforum.org/posts/dJSD5RK6Qoidb3QY5/synthesizing-amplification-and-debate},
}
% The entries below are not actually cited in this paper.
@misc{bottle_caps,
author={Daniel Filan},
title={Bottle Caps Aren't Optimisers},
year=2018,
url={http://danielfilan.com/2018/08/31/bottle_caps_arent_optimisers.html},
}
@article{treeqn,
title={{TreeQN} and {ATreeC}: Differentiable Tree-Structured Models for Deep Reinforcement Learning},
author={Farquhar, Gregory and Rockt{\"a}schel, Tim and Igl, Maximilian and Whiteson, Shimon},
journal={ICLR 2018},
year=2018,
url={https://arxiv.org/abs/1710.11417},
}
@article{univ_plan_net,
title={Universal Planning Networks},
author={Aravind Srinivas and Allan Jabri and Pieter Abbeel and Sergey Levine and Chelsea Finn},
journal={ICML 2018},
year=2018,
url={https://arxiv.org/abs/1804.00645},
}
@article{grad_by_grad,
title={Learning to learn by gradient descent by gradient descent},
author={Marcin Andrychowicz and Misha Denil and Sergio Gomez and Matthew W. Hoffman and David Pfau and Tom Schaul and Brendan Shillingford and Nando de Freitas},
journal={NIPS 2016},
year=2016,
url={https://arxiv.org/abs/1606.04474},
}
@article{rl2,
author={Yan Duan and John Schulman and Xi Chen and Peter L. Bartlett and Ilya Sutskever and Pieter Abbeel},
title={{RL}$^2$: Fast Reinforcement Learning via Slow Reinforcement Learning},
journal={arXiv},
year=2016,
url={https://arxiv.org/abs/1611.02779},
}
@misc{optpow,
author={Eliezer Yudkowsky},
title={Measuring Optimization Power},
year=2008,
url={https://www.lesswrong.com/posts/Q4hLMDrFd8fbteeZ8/measuring-optimization-power},
}
@article{alphazero,
author={Silver, David and Hubert, Thomas and Schrittwieser, Julian and Antonoglou, Ioannis and Lai, Matthew and Guez, Arthur and Lanctot, Marc and Sifre, Laurent and Kumaran, Dharshan and Graepel, Thore and Lillicrap, Timothy and Simonyan, Karen and Hassabis, Demis},
title={A general reinforcement learning algorithm that masters chess, shogi, and Go through self-play},
journal={Science},
volume=362,
number=6419,
pages={1140--1144},
year=2018,
url={https://science.sciencemag.org/content/362/6419/1140.full},
}
@article{drexler,
author={K. E. Drexler},
title={Reframing Superintelligence: Comprehensive AI Services as General Intelligence},
journal={Technical Report \#2019-1, Future of Humanity Institute, University of Oxford},
year=2019,
url={https://www.fhi.ox.ac.uk/wp-content/uploads/Reframing_Superintelligence_FHI-TR-2019-1.1-1.pdf},
}
@misc{paul_solomonoff,
author={Paul Christiano},
title={What does the universal prior actually look like?},
year=2016,
url={https://ordinaryideas.wordpress.com/2016/11/30/what-does-the-universal-prior-actually-look-like},
}
@article{neural_tms,
author={Alex Graves and Greg Wayne and Ivo Danihelka},
title={Neural Turing Machines},
journal={arXiv},
year=2014,
url={https://arxiv.org/abs/1410.5401},
}
@article{nn_simp_bias,
author={Guillermo Valle-Pérez and Chico Q. Camargo and Ard A. Louis},
title={Deep learning generalizes because the parameter-function map is biased towards simple functions},
journal={ICLR 2019},
year=2019,
url={https://arxiv.org/abs/1805.08522},
}
@misc{paul_minimal_circuits,
author={Paul Christiano},
title={Open question: are minimal circuits daemon-free?},
year=2018,
url={https://www.lesswrong.com/posts/nyCHnY7T5PHPLjxmN/open-question-are-minimal-circuits-daemon-free},
}
@misc{chris,
author={Chris van Merwijk},
title={Development of {AI} agents as a principal-agent problem},
year={Forthcoming in 2019},
}
@article{ibarz,
author={Borja Ibarz and Jan Leike and Tobias Pohlen and Geoffrey Irving and Shane Legg and Dario Amodei},
title={Reward learning from human preferences and demonstrations in {Atari}},
journal={NeurIPS 2018},
year=2018,
url={https://arxiv.org/abs/1811.06521},
}
@article{adversarial_examples,
author={Jiawei Su and Danilo Vasconcellos Vargas and Kouichi Sakurai},
title={One pixel attack for fooling deep neural networks},
journal={IEEE Transactions on Evolutionary Computation},
year=2017,
url={http://arxiv.org/abs/1710.08864},
}
@article{irl_unidentifiability,
author={Kareem Amin and Satinder Singh},
title={Towards Resolving Unidentifiability in Inverse Reinforcement Learning},
journal={arXiv},
year=2016,
url={https://arxiv.org/abs/1601.06569},
}
@article{imagination_planners,
author={Razvan Pascanu and Yujia Li and Oriol Vinyals and Nicolas Heess and Lars Buesing and Sebastien Racanière and David Reichert and Théophane Weber and Daan Wierstra and Peter Battaglia},
title={Learning model-based planning from scratch},
journal={arXiv},
year=2017,
url={https://arxiv.org/abs/1707.06170},
}
@article{goodhart,
author={David Manheim and Scott Garrabrant},
title={Categorizing Variants of {Goodhart's} Law},
journal={arXiv},
year=2018,
url={https://arxiv.org/abs/1803.04585},
}
@misc{paul_doom,
author={Paul Christiano},
title={What failure looks like},
year=2019,
url={https://www.alignmentforum.org/posts/HBxe6wdjxK239zajf/more-realistic-tales-of-doom},
}
@article{corrigibility,
author={Nate Soares and Benja Fallenstein and Eliezer Yudkowsky and Stuart Armstrong},
title={Corrigibility},
journal={AAAI 2015},
year=2015,
url={https://intelligence.org/files/Corrigibility.pdf},
}
@misc{paul_robust_corrigibility,
author={Paul Christiano},
title={Worst-case guarantees},
year=2019,
url={https://ai-alignment.com/training-robust-corrigibility-ce0e0a3b9b4d},
}
@article{absent_minded_driver,
author={Robert J. Aumann and Sergiu Hart and Motty Perry},
title={The Absent-Minded Driver},
journal={Games and Economic Behavior},
volume=20,
pages={102--116},
year=1997,
url={http://www.ma.huji.ac.il/raumann/pdf/Minded\%20Driver.pdf},
}
@article{learn_to_rl,
author={Jane X Wang and Zeb Kurth-Nelson and Dhruva Tirumala and Hubert Soyer and Joel Z Leibo and Remi Munos and Charles Blundell and Dharshan Kumaran and Matt Botvinick},
title={Learning to reinforcement learn},
journal={CogSci},
year=2016,
url={https://arxiv.org/abs/1611.05763},
}
@article{concrete_problems,
author={Dario Amodei and Chris Olah and Jacob Steinhardt and Paul Christiano and John Schulman and Dan Mané},
title={Concrete Problems in {AI} Safety},
journal={arXiv},
year=2016,
url={https://arxiv.org/abs/1606.06565},
}
@article{armstrong_preferences,
author={Stuart Armstrong and Sören Mindermann},
title={Occam's razor is insufficient to infer the preferences of irrational agents},
journal={NeurIPS 2018},
year=2017,
url={https://arxiv.org/abs/1712.05812},
}
@article{safety_verification,
author={Xiaowei Huang and Marta Kwiatkowska and Sen Wang and Min Wu},
title={{Safety Verification of Deep Neural Networks}},
journal={CAV 2017},
year=2016,
url={https://arxiv.org/abs/1610.06940},
}
@article{reluplex,
author={Guy Katz and Clark Barrett and David Dill and Kyle Julian and Mykel Kochenderfer},
title={{Reluplex: An Efficient {SMT} Solver for Verifying Deep Neural Networks}},
journal={CAV 2017},
year=2017,
url={https://arxiv.org/abs/1702.01135},
}
@article{practical_verification,
author={Kexin Pei and Yinzhi Cao and Junfeng Yang and Suman Jana},
title={{Towards Practical Verification of Machine Learning: The Case of Computer Vision Systems}},
journal={arXiv},
year=2017,
url={https://arxiv.org/abs/1712.01785},
}
@misc{lesswrong_daemons,
author={Riceissa},
title={Optimization daemons},
year=2018,
url={https://wiki.lesswrong.com/wiki/Optimization_daemons},
}
@misc{induction-heads,
author={Catherine Olsson and Nelson Elhage and Neel Nanda and Nicholas Joseph and Nova DasSarma and Tom Henighan and
Ben Mann and Amanda Askell and Yuntao Bai and Anna Chen and Tom Conerly and Dawn Drain and Deep Ganguli and Zac Hatfield-Dodds and
Danny Hernandez and Scott Johnston and Andy Jones and Jackson Kernion and Liane Lovitt and Kamal Ndousse and Dario Amodei and Tom Brown and
Jack Clark and Jared Kaplan and Sam McCandlish and Chris Olah},
title={In-context Learning and Induction Heads},
year=2022,
url={https://transformer-circuits.pub/2022/in-context-learning-and-induction-heads/index.html},
}
@article{elk,
title = {Eliciting latent knowledge: How to tell if your eyes deceive you},
shorttitle = {Eliciting latent knowledge},
url = {https://www.alignmentforum.org/posts/qHCDysDnvhteW7kRd/arc-s-first-technical-report-eliciting-latent-knowledge},
abstract = {ARC has published a report on Eliciting Latent Knowledge, an open problem which we believe is central to alignment. We think reading this report is the clearest way to understand what problems we are…},
language = {en},
year = {2021},
author = {Paul Christiano and Mark Xu and Ajeya Cotra},
}
@article{simulators,
title = {Simulators},
url = {https://www.alignmentforum.org/posts/vJFdjigzmcXMhNTsx/simulators},
language = {en},
year = {2022},
author = {Janus},
}
@article{how_become_confident,
title = {How do we become confident in the safety of a machine learning system?},
url = {https://www.alignmentforum.org/posts/FDJnZt8Ks2djouQTZ/how-do-we-become-confident-in-the-safety-of-a-machine},
language = {en},
year=2021,
author = {Hubinger, Evan},
}
@misc{wikipedia_bayesian_network,
title = {Bayesian network},
url = {https://en.wikipedia.org/w/index.php?title=Bayesian_network&oldid=1134656115},
journal = {Wikipedia},
month = jan,
year = {2023},
author={Wikipedia}
}
@article{lms_multiverse_generators,
title = {Language models are multiverse generators},
url = {https://generative.ink/posts/language-models-are-multiverse-generators/},
language = {en},
year = {2021},
author = {Janus},
}
@misc{kl_penalty,
author={Tomek Korbak and Ethan Perez},
title={RL with KL penalties is better seen as Bayesian inference},
year=2022,
url={https://www.lesswrong.com/posts/eoHbneGvqDu25Hasc/rl-with-kl-penalties-is-better-seen-as-bayesian-inference},
}
@misc{multiple_worlds,
author={Evan Hubinger},
title={Multiple Worlds, One Universal Wave Function},
year=2020,
url={https://www.lesswrong.com/posts/2D9s6kpegDQtrueBE/multiple-worlds-one-universal-wave-function},
}
@techreport{deep_RL_human_pref,
title = {Deep reinforcement learning from human preferences},
url = {http://arxiv.org/abs/1706.03741},
abstract = {For sophisticated reinforcement learning (RL) systems to interact usefully with real-world environments, we need to communicate complex goals to these systems. In this work, we explore goals defined in terms of (non-expert) human preferences between pairs of trajectory segments. We show that this approach can effectively solve complex RL tasks without access to the reward function, including Atari games and simulated robot locomotion, while providing feedback on less than 1\% of our agent’s interactions with the environment. This reduces the cost of human oversight far enough that it can be practically applied to state-of-the-art RL systems. To demonstrate the flexibility of our approach, we show that we can successfully train complex novel behaviors with about an hour of human time. These behaviors and environments are considerably more complex than any which have been previously learned from human feedback.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Christiano, Paul and Leike, Jan and Brown, Tom B. and Martic, Miljan and Legg, Shane and Amodei, Dario},
month = jul,
year = {2017},
}
@techreport{chain_of_thought,
title = {Chain-of-{Thought} {Prompting} {Elicits} {Reasoning} in {Large} {Language} {Models}},
url = {http://arxiv.org/abs/2201.11903},
abstract = {We explore how generating a chain of thought—a series of intermediate reasoning steps—significantly improves the ability of large language models to perform complex reasoning. In particular, we show how such reasoning abilities emerge naturally in sufficiently large language models via a simple method called chain-of-thought prompting, where a few chain of thought demonstrations are provided as exemplars in prompting. Experiments on three large language models show that chain-of-thought prompting improves performance on a range of arithmetic, commonsense, and symbolic reasoning tasks. The empirical gains can be striking. For instance, prompting a PaLM 540B with just eight chain-of-thought exemplars achieves state-of-the-art accuracy on the GSM8K benchmark of math word problems, surpassing even finetuned GPT-3 with a verifier.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Wei, Jason and Wang, Xuezhi and Schuurmans, Dale and Bosma, Maarten and Ichter, Brian and Xia, Fei and Chi, Ed and Le, Quoc and Zhou, Denny},
month = jan,
year = {2023},
}
@misc{how_likely_deception,
author={Evan Hubinger},
title={How likely is deceptive alignment?},
year=2022,
url={https://www.alignmentforum.org/posts/A9NxPTwbw6r6Awuwt/how-likely-is-deceptive-alignment},
}
@misc{aligning_lms_follow_instructions,
title = {Aligning {Language} {Models} to {Follow} {Instructions}},
url = {https://openai.com/blog/instruction-following/},
author={Ryan Lowe and Jan Leike},
abstract = {We've trained language models that are much better at following user intentions
than GPT-3 while also making them more truthful and less toxic, using techniques
developed through our alignment research. These InstructGPT models, which are
trained with humans in the loop, are now deployed as the default language models},
language = {en},
urldate = {2023-01-31},
journal = {OpenAI},
month = jan,
year = {2022},
}
@misc{verification_not_easier,
author={John Wentworth},
title={Verification Is Not Easier Than Generation In General},
year=2022,
url={https://www.lesswrong.com/posts/2PDC69DDJuAx6GANa/verification-is-not-easier-than-generation-in-general},
}
@misc{janus_gpt_wrangling,
author={Scott Alexander},
title={Janus' GPT Wrangling},
year=2022,
url={https://astralcodexten.substack.com/p/janus-gpt-wrangling},
}
@misc{latent_adversarial_training,
author={Adam Jermyn},
title={Latent Adversarial Training},
year=2022,
url={https://www.alignmentforum.org/posts/atBQ3NHyqnBadrsGP/latent-adversarial-training},
}
@misc{fun_with_12_ooms,
author={Daniel Kokotajlo},
title={Fun with +12 OOMs of Compute},
year=2021,
url={https://www.alignmentforum.org/posts/rzqACeBGycZtqCfaX/fun-with-12-ooms-of-compute#Amp_GPT_7__},
}
@misc{conditioning_generative_models_with_restrictions,
author={Adam Jermyn},
title={Conditioning Generative Models with Restrictions},
year=2022,
url={https://www.alignmentforum.org/posts/adiszfnFgPEnRsGSr/conditioning-generative-models-with-restrictions},
}
@misc{strategy_for_conditioning,
author={James Lucassen and Evan Hubinger},
title={Strategy For Conditioning Generative Models},
year=2022,
url={https://www.alignmentforum.org/posts/HAz7apopTzozrqW2k/strategy-for-conditioning-generative-models},
}
@misc{conditioning_generative_models,
author={Adam Jermyn},
title={Conditioning Generative Models},
year=2022,
url={https://www.alignmentforum.org/posts/nXeLPcT9uhfG3TMPS/conditioning-generative-models},
}
@misc{factored_cognition,
title={Factored Cognition},
author={James Lucassen and Evan Hubinger},
url={https://www.alignmentforum.org/posts/HAz7apopTzozrqW2k/strategy-for-conditioning-generative-models},
year={2022}
}
@misc{training_goals_llms,
author={Johannes Treutlein},
title={Training goals for large language models},
year=2022,
url={https://www.alignmentforum.org/posts/dWJNFHnC4bkdbovug/training-goals-for-large-language-models},
}
@misc{proper_scoring_rules_dont_guarantee,
author={Johannes Treutlein and Rubi J. Hudson and Caspar Oesterheld},
title={Proper scoring rules don't guarantee predicting fixed points},
year=2022,
url={https://www.alignmentforum.org/posts/Aufg88v7mQ2RuEXkS/proper-scoring-rules-don-t-guarantee-predicting-fixed-points},
}
@misc{underspecification_of_oracle_ai,
author={Rubi J. Hudson and Adam Jermyn and Johannes Treutlein},
title={Underspecification of Oracle AI},
year=2023,
url={https://www.alignmentforum.org/posts/aBRS3x4sPSJ9G6xkj/underspecification-of-oracle-ai},
}
@article{armstrong_good_2017,
title = {Good and safe uses of {AI} {Oracles}},
url = {https://arxiv.org/abs/1711.05541v5},
abstract = {It is possible that powerful and potentially dangerous artificial intelligence (AI) might be developed in the future. An Oracle is a design which aims to restrain the impact of a potentially dangerous AI by restricting the agent to no actions besides answering questions. Unfortunately, most Oracles will be motivated to gain more control over the world by manipulating users through the content of their answers, and Oracles of potentially high intelligence might be very successful at this {\textbackslash}citep\{DBLP:journals/corr/AlfonsecaCACAR16\}. In this paper we present two designs for Oracles which, even under pessimistic assumptions, will not manipulate their users into releasing them and yet will still be incentivised to provide their users with helpful answers. The first design is the counterfactual Oracle -- which choses its answer as if it expected nobody to ever read it. The second design is the low-bandwidth Oracle -- which is limited by the quantity of information it can transmit.},
language = {en},
urldate = {2023-01-31},
author = {Armstrong, Stuart and O'Rorke, Xavier},
month = nov,
year = {2017},
}
@incollection{weirich_causal_2020,
edition = {Winter 2020},
title = {Causal {Decision} {Theory}},
url = {https://plato.stanford.edu/archives/win2020/entries/decision-causal/},
abstract = {Causal decision theory adopts principles of rational choice that attend to an act’s consequences. It maintains that an account of rational choice must use causality to identify the considerations that make a choice rational. Given a set of options constituting a decision problem, decision theory recommends an option that maximizes utility, that is, an option whose utility equals or exceeds the utility of every other option. It evaluates an option’s utility by calculating the option’s expected utility. It uses probabilities and utilities of an option’s possible outcomes to define an option’s expected utility. The probabilities depend on the option. Causal decision theory takes the dependence to be causal rather than merely evidential. This essay explains causal decision theory, reviews its history, describes current research in causal decision theory, and surveys the theory’s philosophical foundations. The literature on causal decision theory is vast, and this essay covers only a portion of it.},
urldate = {2023-01-31},
booktitle = {The {Stanford} {Encyclopedia} of {Philosophy}},
publisher = {Metaphysics Research Lab, Stanford University},
author = {Weirich, Paul},
editor = {Zalta, Edward N.},
year = {2020},
}
@misc{lcdt,
author={Adam Shimi and Evan Hubinger},
title={LCDT, A Myopic Decision Theory},
year=2021,
url={https://www.alignmentforum.org/posts/Y76durQHrfqwgwM5o/lcdt-a-myopic-decision-theory},
}
@misc{intuitive_solomonoff_induction,
author={Alex Altair},
title={An Intuitive Explanation of Solomonoff Induction},
year=2012,
url={https://www.lesswrong.com/posts/Kyc5dFDzBg4WccrbK/an-intuitive-explanation-of-solomonoff-induction},
}
@misc{paul_christiano_current_work,
author={Paul Christiano},
title={Current work in AI alignment},
year=2020,
note={EA Global},
url={https://forum.effectivealtruism.org/posts/63stBTw3WAW6k45dY/paul-christiano-current-work-in-ai-alignment},
}
@techreport{bai_training_2022,
title = {Training a {Helpful} and {Harmless} {Assistant} with {Reinforcement} {Learning} from {Human} {Feedback}},
url = {http://arxiv.org/abs/2204.05862},
abstract = {We apply preference modeling and reinforcement learning from human feedback (RLHF) to finetune language models to act as helpful and harmless assistants. We find this alignment training improves performance on almost all NLP evaluations, and is fully compatible with training for specialized skills such as python coding and summarization. We explore an iterated online mode of training, where preference models and RL policies are updated on a weekly cadence with fresh human feedback data, efficiently improving our datasets and models. Finally, we investigate the robustness of RLHF training, and identify a roughly linear relation between the RL reward and the square root of the KL divergence between the policy and its initialization. Alongside our main results, we perform peripheral analyses on calibration, competing objectives, and the use of OOD detection, compare our models with human writers, and provide samples from our models using prompts appearing in recent related work.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Bai, Yuntao and Jones, Andy and Ndousse, Kamal and Askell, Amanda and Chen, Anna and DasSarma, Nova and Drain, Dawn and Fort, Stanislav and Ganguli, Deep and Henighan, Tom and Joseph, Nicholas and Kadavath, Saurav and Kernion, Jackson and Conerly, Tom and El-Showk, Sheer and Elhage, Nelson and Hatfield-Dodds, Zac and Hernandez, Danny and Hume, Tristan and Johnston, Scott and Kravec, Shauna and Lovitt, Liane and Nanda, Neel and Olsson, Catherine and Amodei, Dario and Brown, Tom and Clark, Jack and McCandlish, Sam and Olah, Chris and Mann, Ben and Kaplan, Jared},
month = apr,
year = {2022},
}
@misc{conditioning_prompts_fine_tuning,
author={Adam Jermyn},
title={Conditioning, Prompts, and Fine-Tuning},
year=2022,
url={https://www.alignmentforum.org/posts/chevXfQmRYrTZnj8r/conditioning-prompts-and-fine-tuning},
}
@techreport{zou_forecasting_2022,
title = {Forecasting {Future} {World} {Events} with {Neural} {Networks}},
url = {http://arxiv.org/abs/2206.15474},
abstract = {Forecasting future world events is a challenging but valuable task. Forecasts of climate, geopolitical conflict, pandemics and economic indicators help shape policy and decision making. In these domains, the judgment of expert humans contributes to the best forecasts. Given advances in language modeling, can these forecasts be automated? To this end, we introduce Autocast, a dataset containing thousands of forecasting questions and an accompanying news corpus. Questions are taken from forecasting tournaments, ensuring high quality, real-world importance, and diversity. The news corpus is organized by date, allowing us to precisely simulate the conditions under which humans made past forecasts (avoiding leakage from the future). Motivated by the difficulty of forecasting numbers across orders of magnitude (e.g. global cases of COVID-19 in 2022), we also curate IntervalQA, a dataset of numerical questions and metrics for calibration. We test language models on our forecasting task and find that performance is far below a human expert baseline. However, performance improves with increased model size and incorporation of relevant information from the news corpus. In sum, Autocast poses a novel challenge for large language models and improved performance could bring large practical benefits.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Zou, Andy and Xiao, Tristan and Jia, Ryan and Kwon, Joe and Mazeika, Mantas and Li, Richard and Song, Dawn and Steinhardt, Jacob and Evans, Owain and Hendrycks, Dan},
month = oct,
year = {2022},
}
@techreport{kojima_large_2023,
title = {Large {Language} {Models} are {Zero}-{Shot} {Reasoners}},
url = {http://arxiv.org/abs/2205.11916},
abstract = {Pretrained large language models (LLMs) are widely used in many sub-fields of natural language processing (NLP) and generally known as excellent few-shot learners with task-specific exemplars. Notably, chain of thought (CoT) prompting, a recent technique for eliciting complex multi-step reasoning through step-by-step answer examples, achieved the state-of-the-art performances in arithmetics and symbolic reasoning, difficult system-2 tasks that do not follow the standard scaling laws for LLMs. While these successes are often attributed to LLMs’ ability for few-shot learning, we show that LLMs are decent zero-shot reasoners by simply adding “Let’s think step by step” before each answer. Experimental results demonstrate that our Zero-shot-CoT, using the same single prompt template, significantly outperforms zero-shot LLM performances on diverse benchmark reasoning tasks including arithmetics (MultiArith, GSM8K, AQUA-RAT, SVAMP), symbolic reasoning (Last Letter, Coin Flip), and other logical reasoning tasks (Date Understanding, Tracking Shuffled Objects), without any hand-crafted few-shot examples, e.g. increasing the accuracy on MultiArith from 17.7\% to 78.7\% and GSM8K from 10.4\% to 40.7\% with large-scale InstructGPT model (text-davinci-002), as well as similar magnitudes of improvements with another off-the-shelf large model, 540B parameter PaLM. The versatility of this single prompt across very diverse reasoning tasks hints at untapped and understudied fundamental zero-shot capabilities of LLMs, suggesting high-level, multi-task broad cognitive capabilities may be extracted by simple prompting. We hope our work not only serves as the minimal strongest zero-shot baseline for the challenging reasoning benchmarks, but also highlights the importance of carefully exploring and analyzing the enormous zero-shot knowledge hidden inside LLMs before crafting finetuning datasets or few-shot exemplars.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Kojima, Takeshi and Gu, Shixiang Shane and Reid, Machel and Matsuo, Yutaka and Iwasawa, Yusuke},
month = jan,
year = {2023},
}
@misc{why_ai_alignment_hard,
author={Ajeya Cotra},
title={Why AI alignment could be hard with modern deep learning},
year=2021,
url={https://www.cold-takes.com/why-ai-alignment-could-be-hard-with-modern-deep-learning/},
}
@misc{path_dependence_ml_inductive_biases,
author={Vivek Hebbar and Evan Hubinger},
title={Path dependence in ML inductive biases},
year=2022,
url={https://www.alignmentforum.org/posts/bxkWd6WdkPqGmdHEk/path-dependence-in-ml-inductive-biases},
}
@misc{tiling_agents_self_modifying_ai,
author={Yudkowsky, Eliezer and Herreshoff, Marcello},
title={Tiling Agents for Self-Modifying AI, and the Löbian Obstacle},
year=2013,
url={http://intelligence.org/files/TilingAgentsDraft.pdf},
}
@misc{attempts_at_forwarding_speed_priors,
author={James Lucassen and Evan Hubinger},
title={Attempts at Forwarding Speed Priors},
year=2022,
url={https://www.alignmentforum.org/posts/bzkCWEHG2tprB3eq2/attempts-at-forwarding-speed-priors},
}
@misc{agents_over_cartesian_world_models,
author={Mark Xu and Evan Hubinger},
title={Agents Over Cartesian World Models},
year=2021,
url={https://www.alignmentforum.org/posts/LBNjeGaJZw7QdybMw/agents-over-cartesian-world-models},
}
@misc{a_transparency_and_interpretability_tech_tree,
author={Evan Hubinger},
title={A transparency and interpretability tech tree},
year=2022,
url={https://www.alignmentforum.org/posts/nbq2bWLcYmSGup9aF/a-transparency-and-interpretability-tech-tree},
}
@techreport{gao_scaling_2022,
title = {Scaling {Laws} for {Reward} {Model} {Overoptimization}},
url = {http://arxiv.org/abs/2210.10760},
abstract = {In reinforcement learning from human feedback, it is common to optimize against a reward model trained to predict human preferences. Because the reward model is an imperfect proxy, optimizing its value too much can hinder ground truth performance, in accordance with Goodhart’s law. This effect has been frequently observed, but not carefully measured due to the expense of collecting human preference data. In this work, we use a synthetic setup in which a fixed “gold-standard” reward model plays the role of humans, providing labels used to train a proxy reward model. We study how the gold reward model score changes as we optimize against the proxy reward model using either reinforcement learning or best-of-n sampling. We find that this relationship follows a different functional form depending on the method of optimization, and that in both cases its coefficients scale smoothly with the number of reward model parameters. We also study the effect on this relationship of the size of the reward model dataset, the number of reward model and policy parameters, and the coefficient of the KL penalty added to the reward in the reinforcement learning setup. We explore the implications of these empirical results for theoretical considerations in AI alignment.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Gao, Leo and Schulman, John and Hilton, Jacob},
month = oct,
year = {2022},
}
@misc{mysteries_of_mode_collapse,
author={Janus},
title={Mysteries of mode collapse},
year=2022,
url={https://www.lesswrong.com/posts/t9svvNPNmFf5Qa3TA/mysteries-of-mode-collapse-due-to-rlhf},
}
@misc{cross_entropy,
title = {Cross entropy},
copyright = {Creative Commons Attribution-ShareAlike License},
url = {https://en.wikipedia.org/w/index.php?title=Cross_entropy},
language = {en},
urldate = {2023-01-31},
journal = {Wikipedia},
month = jan,
year = {2023},
}
@misc{kl_divergence,
title = {Kullback-{Leibler} divergence},
copyright = {Creative Commons Attribution-ShareAlike License},
url = {https://en.wikipedia.org/w/index.php?title=Kullback%E2%80%93Leibler_divergence},
urldate = {2023-01-31},
journal = {Wikipedia},
author={Wikipedia},
month = jan,
year = {2023},
}
@techreport{chen_decision_2021,
title = {Decision {Transformer}: {Reinforcement} {Learning} via {Sequence} {Modeling}},
shorttitle = {Decision {Transformer}},
url = {http://arxiv.org/abs/2106.01345},
abstract = {We introduce a framework that abstracts Reinforcement Learning (RL) as a sequence modeling problem. This allows us to draw upon the simplicity and scalability of the Transformer architecture, and associated advances in language modeling such as GPT-x and BERT. In particular, we present Decision Transformer, an architecture that casts the problem of RL as conditional sequence modeling. Unlike prior approaches to RL that fit value functions or compute policy gradients, Decision Transformer simply outputs the optimal actions by leveraging a causally masked Transformer. By conditioning an autoregressive model on the desired return (reward), past states, and actions, our Decision Transformer model can generate future actions that achieve the desired return. Despite its simplicity, Decision Transformer matches or exceeds the performance of state-of-the-art model-free offline RL baselines on Atari, OpenAI Gym, and Key-to-Door tasks.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},
author = {Chen, Lili and Lu, Kevin and Rajeswaran, Aravind and Lee, Kimin and Grover, Aditya and Laskin, Michael and Abbeel, Pieter and Srinivas, Aravind and Mordatch, Igor},
month = jun,
year = {2021},
}
@inproceedings{taylor_quantilizers_2016,
title = {Quantilizers: {A} {Safer} {Alternative} to {Maximizers} for {Limited} {Optimization}},
shorttitle = {Quantilizers},
url = {https://intelligence.org/files/QuantilizersSaferAlternative.pdf},
abstract = {In the field of AI, expected utility maximizers are commonly used as a model for idealized agents. However, expected utility maximization can lead to unintended solutions when the utility function does not quantify everything the operators care about: imagine, for example, an expected utility maximizer tasked with winning money on the stock market, which has no regard for whether it accidentally causes a market crash. Once AI systems become sufficiently intelligent and powerful, these unintended solutions could become quite dangerous. In this paper, we describe an alternative to expected utility maximization for powerful AI systems, which we call expected utility quantilization. This could allow the construction of AI systems that do not necessarily fall into strange and unanticipated shortcuts and edge cases in pursuit of their goals.},
author = {Taylor, Jessica},
month = mar,
year = {2016},
}
@misc{homogeneity_vs_heterogeneity,
author={Evan Hubinger},
title={Homogeneity vs. heterogeneity in AI takeoff scenarios},
year=2020,
url={https://www.alignmentforum.org/posts/mKBfa8v4S9pNKSyKK/homogeneity-vs-heterogeneity-in-ai-takeoff-scenarios},
}
@misc{monitoring_for_deceptive_alignment,
author={Evan Hubinger},
title={Monitoring for deceptive alignment},
year=2022,
url={https://www.lesswrong.com/posts/Km9sHjHTsBdbgwKyi/monitoring-for-deceptive-alignment},
}
@misc{minimal_viable_product,
author={Jan Leike},
title={A minimal viable product for alignment},
year=2022,
url={https://www.alignmentforum.org/posts/fYf9JAwa6BYMt8GBj/link-a-minimal-viable-product-for-alignment},
}
@techreport{perez_discovering_2022,
title = {Discovering Language Model Behaviors with Model-Written Evaluations},
url = {http://arxiv.org/abs/2212.09251},
abstract = {As language models (LMs) scale, they develop many novel behaviors, good and bad, exacerbating the need to evaluate how they behave. Prior work creates evaluations with crowdwork (which is time-consuming and expensive) or existing data sources (which are not always available). Here, we automatically generate evaluations with LMs. We explore approaches with varying amounts of human effort, from instructing LMs to write yes/no questions to making complex Winogender schemas with multiple stages of LM-based generation and filtering. Crowdworkers rate the examples as highly relevant and agree with 90-100\% of labels, sometimes more so than corresponding human-written datasets. We generate 154 datasets and discover new cases of inverse scaling where LMs get worse with size. Larger LMs repeat back a dialog user’s preferred answer (“sycophancy”) and express greater desire to pursue concerning goals like resource acquisition and goal preservation. We also find some of the first examples of inverse scaling in RL from Human Feedback (RLHF), where more RLHF makes LMs worse. For example, RLHF makes LMs express stronger political views (on gun rights and immigration) and a greater desire to avoid shut down. Overall, LM-written evaluations are high-quality and let us quickly discover many novel LM behaviors.},
language = {en},
urldate = {2023-01-31},
institution = {arXiv},