Commit 7f126d8

papers + undergrads
1 parent 1c6a858 commit 7f126d8

Showing 5 changed files with 57 additions and 10 deletions.


_bibliography/papers.bib

Lines changed: 36 additions & 10 deletions
@@ -1,6 +1,39 @@
 ---
 ---
 
+@article{finlayson2025languagemodelforgeryresistantsignature,
+title={Every Language Model Has a Forgery-Resistant Signature},
+author={Matthew Finlayson and Xiang Ren and Swabha Swayamdipta},
+year={2025},
+journal={Under Review},
+abbr={arXiv},
+url={https://arxiv.org/abs/2510.14086},
+code={},
+abstract={The ubiquity of closed-weight language models with public-facing APIs has generated interest in forensic methods, both for extracting hidden model details (e.g., parameters) and for identifying models by their outputs. One successful approach to these goals has been to exploit the geometric constraints imposed by the language model architecture and parameters. In this work, we show that a lesser-known geometric constraint--namely, that language model outputs lie on the surface of a high-dimensional ellipse--functions as a signature for the model and can be used to identify the source model of a given output. This ellipse signature has unique properties that distinguish it from existing model-output association methods like language model fingerprints. In particular, the signature is hard to forge: without direct access to model parameters, it is practically infeasible to produce log-probabilities (logprobs) on the ellipse. Secondly, the signature is naturally occurring, since all language models have these elliptical constraints. Thirdly, the signature is self-contained, in that it is detectable without access to the model inputs or the full weights. Finally, the signature is compact and redundant, as it is independently detectable in each logprob output from the model. We evaluate a novel technique for extracting the ellipse from small models and discuss the practical hurdles that make it infeasible for production-scale models. Finally, we use ellipse signatures to propose a protocol for language model output verification, analogous to cryptographic symmetric-key message authentication systems.},
+}
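The forgery-resistance claim in the finlayson2025 abstract above comes down to a membership test: a verifier who already knows a model's ellipse can check whether an observed logprob vector lies on it. Below is a minimal numpy sketch of that idea, assuming the ellipse is parameterized as an affine image of the unit sphere (logits ≈ c + A·u with ‖u‖ = 1) and that (A, c) are already known from a trusted copy of the model; the paper's actual extraction and verification protocol may differ.

```python
# Hedged sketch: test whether a logprob vector is consistent with a known
# ellipse {c + A u : ||u|| = 1}. The parameterization and tolerance are
# illustrative assumptions, not the paper's exact procedure.
import numpy as np

def on_ellipse(logprobs: np.ndarray, A: np.ndarray, c: np.ndarray,
               tol: float = 1e-4) -> bool:
    """Heuristic check that `logprobs` (shape [V]) lies on the model's ellipse.

    Logprobs fix the logits only up to an additive constant s, so we solve
    jointly for (u, s) in  A u + s * 1 = logprobs - c  by least squares,
    then require a tiny residual and ||u|| close to 1.
    """
    V = logprobs.shape[0]
    design = np.hstack([A, np.ones((V, 1))])          # unknowns: [u; s]
    sol, *_ = np.linalg.lstsq(design, logprobs - c, rcond=None)
    u = sol[:-1]
    residual = np.linalg.norm(design @ sol - (logprobs - c))
    return bool(residual < tol and abs(np.linalg.norm(u) - 1.0) < tol)
```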
+
+@article{yauney2025reliablelanguagemodelmicrobenchmarking,
+title={How Reliable is Language Model Micro-Benchmarking?},
+author={Gregory Yauney and Shahzaib Saqib Warraich and Swabha Swayamdipta},
+year={2025},
+journal={Under Review},
+abbr={arXiv},
+code={https://github.com/dill-lab/micro-benchmarking-reliability},
+url={https://arxiv.org/abs/2510.08730},
+abstract={Micro-benchmarking offers a solution to the often prohibitive time and cost of language model development: evaluate on a very small subset of existing benchmarks. Can these micro-benchmarks, however, rank models as consistently as the full benchmarks they replace? And can they rank models more consistently than selecting a random subset of data points? In many scenarios, we find that the answer is no. We introduce a meta-evaluation measure for micro-benchmarking which investigates how well a micro-benchmark can rank two models as a function of their performance difference on the full benchmark. This approach can determine which model pairs can be ranked correctly by a micro-benchmark, allowing for a finer-grained analysis of the trade-off between micro-benchmark size and reliability. Prior work has suggested selecting as few as 10 examples; we find that no micro-benchmarking method can consistently rank model pairs 3.5 points of accuracy apart on MMLU-Pro or 4 points apart on BIG-bench Hard. In order to consistently rank model pairs with relatively similar performances, we show that often as many as 250 examples must be selected, at which point random sampling is competitive with existing micro-benchmarking methods. When comparing only 8B instruction-tuned models on MMLU-Pro micro-benchmarks with 25 examples, we find that more than half of pairwise comparisons are not likely to be preserved. Our work provides actionable guidance for both micro-benchmark users and developers in navigating the trade-off between evaluation efficiency and reliability.},
+}
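A quick way to see the trade-off the yauney2025 abstract above describes is to estimate, for a given subset size, how often a random micro-benchmark ranks two models the same way the full benchmark does. Here is a rough numpy sketch of that random-subset baseline on synthetic per-example scores (illustrative only; the paper's meta-evaluation also covers dedicated micro-benchmarking methods):

```python
# Hedged sketch: how often does a random size-k subset preserve the
# full-benchmark ranking of two models? The scores below are synthetic.
import numpy as np

def ranking_agreement(scores_a: np.ndarray, scores_b: np.ndarray,
                      k: int, trials: int = 1000, seed: int = 0) -> float:
    """Fraction of random size-k subsets that rank the two models
    the same way the full benchmark does."""
    rng = np.random.default_rng(seed)
    full_sign = np.sign(scores_a.mean() - scores_b.mean())
    n = len(scores_a)
    agree = 0
    for _ in range(trials):
        idx = rng.choice(n, size=k, replace=False)
        agree += int(np.sign(scores_a[idx].mean() - scores_b[idx].mean()) == full_sign)
    return agree / trials

# Toy usage: two synthetic "models" roughly 3.5 accuracy points apart on 1,000 items.
rng = np.random.default_rng(1)
scores_a = (rng.random(1000) < 0.700).astype(float)
scores_b = (rng.random(1000) < 0.665).astype(float)
print(ranking_agreement(scores_a, scores_b, k=25))    # small subsets flip the ranking often
print(ranking_agreement(scores_a, scores_b, k=250))   # larger subsets agree more reliably
```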
+
+@article{he2025believingseeingqualityscores,
+title={Believing without Seeing: Quality Scores for Contextualizing Vision-Language Model Explanations},
+author={Keyu He and Tejas Srinivasan and Brihi Joshi and Xiang Ren and Jesse Thomason and Swabha Swayamdipta},
+year={2025},
+journal={Under Review},
+abbr={arXiv},
+url={https://arxiv.org/abs/2509.25844},
+code={},
+abstract={When people query Vision-Language Models (VLMs) but cannot see the accompanying visual context (e.g. for blind and low-vision users), augmenting VLM predictions with natural language explanations can signal which model predictions are reliable. However, prior work has found that explanations can easily convince users that inaccurate VLM predictions are correct. To remedy undesirable overreliance on VLM predictions, we propose evaluating two complementary qualities of VLM-generated explanations via two quality scoring functions. We propose Visual Fidelity, which captures how faithful an explanation is to the visual context, and Contrastiveness, which captures how well the explanation identifies visual details that distinguish the model's prediction from plausible alternatives. On the A-OKVQA and VizWiz tasks, these quality scoring functions are better calibrated with model correctness than existing explanation qualities. We conduct a user study in which participants have to decide whether a VLM prediction is accurate without viewing its visual context. We observe that showing our quality scores alongside VLM explanations improves participants' accuracy at predicting VLM correctness by 11.1%, including a 15.4% reduction in the rate of falsely believing incorrect predictions. These findings highlight the utility of explanation quality scores in fostering appropriate reliance on VLM predictions.},
+}
+
 @article{nazir2025betterlanguagemodelinversion,
 title={{Better Language Model Inversion by Compactly Representing Next-Token Distributions}},
 author={Murtaza Nazir and Matthew Finlayson and John X. Morris and Xiang Ren and Swabha Swayamdipta},
@@ -9,6 +42,7 @@ @article{nazir2025betterlanguagemodelinversion
 abbr={NeurIPS},
 journal={Proc. of NeurIPS},
 url={https://arxiv.org/abs/2506.17090},
+code={},
 abstract={{Language model inversion seeks to recover hidden prompts using only language model outputs. This capability has implications for security and accountability in language model deployments, such as leaking private information from an API-protected language model's system message. We propose a new method -- prompt inversion from logprob sequences (PILS) -- that recovers hidden prompts by gleaning clues from the model's next-token probabilities over the course of multiple generation steps. Our method is enabled by a key insight: The vector-valued outputs of a language model occupy a low-dimensional subspace. This enables us to losslessly compress the full next-token probability distribution over multiple generation steps using a linear map, allowing more output information to be used for inversion. Our approach yields massive gains over previous state-of-the-art methods for recovering hidden prompts, achieving 2--3.5 times higher exact recovery rates across test sets, in one case increasing the recovery rate from 17% to 60%. Our method also exhibits surprisingly good generalization behavior; for instance, an inverter trained on 16 generation steps gets 5--27 points higher prompt recovery when we increase the number of steps to 32 at test time. Furthermore, we demonstrate strong performance of our method on the more challenging task of recovering hidden system messages. We also analyze the role of verbatim repetition in prompt recovery and propose a new method for cross-family model transfer for logit-based inverters. Our findings show that next-token probabilities are a considerably more vulnerable attack surface for inversion attacks than previously known.}}
 }
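The key insight in the nazir2025 entry above is that a language model's vector-valued outputs occupy a low-dimensional subspace, so full next-token distributions can be compressed losslessly with a linear map. A hedged numpy sketch of just that subspace fact, using random matrices as stand-ins for a real model's unembedding and hidden states (the paper's PILS method additionally handles logprob normalization and trains an inverter on the compressed outputs):

```python
# Hedged sketch: logit vectors of the form W @ h live in a rank-d subspace of
# R^V (d = hidden size << V = vocab size), so a d-dimensional linear code
# recovers them exactly. Random W and H stand in for a real model here.
import numpy as np

rng = np.random.default_rng(0)
V, d, steps = 8_000, 256, 16              # vocab size, hidden size, generation steps
W = rng.standard_normal((V, d))           # stand-in for the model's unembedding matrix
H = rng.standard_normal((d, steps))       # stand-in for final hidden states
logits = W @ H                            # one V-dim output vector per step

# Compress: project each V-dim vector onto an orthonormal basis of col(W).
Q, _ = np.linalg.qr(W)                    # Q has shape [V, d]
codes = Q.T @ logits                      # shape [d, steps]: the compressed representation

# Decompress and confirm the reconstruction is numerically lossless.
reconstructed = Q @ codes
print(np.max(np.abs(reconstructed - logits)))   # at machine precision: nothing lost
```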
 
@@ -45,7 +79,7 @@ @article{liu2025evaluationimperfectbenchmarksratings
 }
 
 @article{ranjit2025nvdrs,
-title={Designing and Validating Intervention Opportunities for Suicide Prevention with Language Model Assistants},
+title={Uncovering Intervention Opportunities for Suicide Prevention with Language Model Assistants},
 author={Jaspreet Ranjit and Hyundong J. Cho and Claire J. Smerdon and Yoonsoo Nam and Myles Phung and Jonathan May and John R. Blosnich and Swabha Swayamdipta},
 year={2025},
 journal={EAAMO / NeurIPS Workshop on GenAI for Health},
@@ -64,15 +98,6 @@ @article{surana2025chemref
 url={./},
 }
 
-@article{he2025believingseeingqualityscores,
-title={Believing without Seeing: Quality Scores for Contextualizing Vision-Language Model Explanations},
-author={Keyu He and Tejas Srinivasan and Brihi Joshi and Xiang Ren and Jesse Thomason and Swabha Swayamdipta},
-year={2025},
-journal={Under Review},
-abbr={arXiv},
-url={https://arxiv.org/abs/2509.25844},
-}
-
 @article{kulkarni2025evaluatingevaluationmetrics,
 title={Evaluating Evaluation Metrics -- The Mirage of Hallucination Detection},
 author={Atharva Kulkarni and Yuan Zhang and Joel Ruben Antony Moniz and Xiou Ge and Bo-Hsiang Tseng and Dhivya Piraviperumal and Swabha Swayamdipta and Hong Yu},
@@ -100,6 +125,7 @@ @article{cui2025robustdatawatermarkinglanguage
 journal={Findings of ACL},
 abbr={ACL},
 url={https://arxiv.org/abs/2503.04036},
+code={https://github.com/dill-lab/Fictitious_Fact_Watermarks},
 abstract={Data watermarking in language models injects traceable signals, such as specific token sequences or stylistic patterns, into copyrighted text, allowing copyright holders to track and verify training data ownership. Previous data watermarking techniques primarily focus on effective memorization after pretraining, while overlooking challenges that arise in other stages of the LLM pipeline, such as the risk of watermark filtering during data preprocessing, or potential forgetting through post-training, or verification difficulties due to API-only access. We propose a novel data watermarking approach that injects coherent and plausible yet fictitious knowledge into training data using generated passages describing a fictitious entity and its associated attributes. Our watermarks are designed to be memorized by the LLM through seamlessly integrating in its training data, making them harder to detect lexically during preprocessing. We demonstrate that our watermarks can be effectively memorized by LLMs, and that increasing our watermarks' density, length, and diversity of attributes strengthens their memorization. We further show that our watermarks remain robust throughout LLM development, maintaining their effectiveness after continual pretraining and supervised finetuning. Finally, we show that our data watermarks can be evaluated even under API-only access via question answering.},
 }
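The cui2025 entry above ends by noting that fictitious-fact watermarks can be verified even under API-only access via question answering. A minimal sketch of that kind of check; the fictitious facts, the question template, and `query_model` below are hypothetical placeholders rather than the released code in the linked Fictitious_Fact_Watermarks repository:

```python
# Hedged sketch: API-only watermark check via question answering. The fictitious
# facts, the question template, and `query_model` are hypothetical stand-ins.
from typing import Callable

# Fictitious entity/attribute pairs the copyright holder injected into their corpus.
WATERMARK_FACTS = [
    {"entity": "Veltharion Crest", "attribute": "founding year", "answer": "1837"},
    {"entity": "Veltharion Crest", "attribute": "headquarters city", "answer": "Maribel Falls"},
]

def watermark_hit_rate(query_model: Callable[[str], str]) -> float:
    """Fraction of injected fictitious facts the suspect model reproduces."""
    hits = 0
    for fact in WATERMARK_FACTS:
        question = f"What is the {fact['attribute']} of {fact['entity']}?"
        response = query_model(question)
        hits += int(fact["answer"].lower() in response.lower())
    return hits / len(WATERMARK_FACTS)

# A model that answers these made-up questions correctly far above chance is
# evidence that the watermarked passages were in its training data.
```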

_data/coauthors.yml

Lines changed: 4 additions & 0 deletions
@@ -71,3 +71,7 @@
 "warraich":
   - firstname: ["Shahzaib", "Shahzaib Saqib"]
     url: "https://www.linkedin.com/in/shahzaib-saqib-warraich-348aa714a/"
+
+"yauney":
+  - firstname: ["Greg", "Gregory", "Gregory Yauney"]
+    url: "https://gyauney.github.io/"

_projects/undergrads/jason.md

Lines changed: 8 additions & 0 deletions
@@ -0,0 +1,8 @@
+---
+layout: page
+title: Jason (Jiasen) Liu
+description: Fall 2025 -
+img: /assets/img/dill-canva-transp.png
+importance: 7
+category: undergrad
+---

_projects/undergrads/mike.md

Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
+---
+layout: page
+title: Mike Gee
+description: Summer 2025 -
+img: /assets/img/people/mike.jpg
+importance: 3
+category: undergrad
+redirect: https://mpg05883.github.io/
+---

assets/img/people/mike.jpg

Binary image file, 90.5 KB
